]> git.saurik.com Git - redis.git/blame - redis.c
first implementation of HSET/HSET. More work needed
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
ed745154 30#define REDIS_VERSION "1.3.4"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
/* Static server configuration: compile-time defaults and hard limits. */
#define REDIS_SERVERPORT                6379    /* TCP port */
#define REDIS_MAXIDLETIME               (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               4
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    100     /* try to expire 100 keys/second */
#define REDIS_MAX_WRITE_PER_EVENT       (1024*64)
#define REDIS_REQUEST_MAX_SIZE          (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD          3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT        256
ed9b544e 102
/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */

/* Command flags (bit values, stored in the 'flags' field of a command) */
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM       4
ed9b544e 114
/* Object types */
#define REDIS_STRING            0
#define REDIS_LIST              1
#define REDIS_SET               2
#define REDIS_ZSET              3
#define REDIS_HASH              4

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW      0       /* Raw representation */
#define REDIS_ENCODING_INT      1       /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2       /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3       /* Encoded as a hash table */

/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255
134
/* Defines related to the dump file format. Storing a 32 bit length for
 * every short key would waste a lot of space, so the two most significant
 * bits of the first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the remaining 6 bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits + 8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit len follows
 * 11|000000 => a specially encoded object follows. The six bits number
 *              specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8      0       /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16     1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32     2       /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF       3       /* string compressed with LZF */
a4d1ba9a 161
/* Virtual memory: values of the per-object 'storage' field. */
#define REDIS_VM_MEMORY         0       /* The object is in memory */
#define REDIS_VM_SWAPPED        1       /* The object is on disk */
#define REDIS_VM_SWAPPING       2       /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING        3       /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. Virtual Memory I/O operations are performed by
 * threads, but once completed they must be processed by the main thread in
 * order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 179
/* Client flags (bit values, may be OR-ed together) */
#define REDIS_SLAVE     1   /* This client is a slave server */
#define REDIS_MASTER    2   /* This client is a master server */
#define REDIS_MONITOR   4   /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     8   /* This client is in a MULTI context */
#define REDIS_BLOCKED   16  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   32  /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state instead the server
 * is waiting to start the next background saving in order to send updates
 * to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
201
/* List related stuff: which end of a list an operation refers to */
#define REDIS_HEAD              0
#define REDIS_TAIL              1

/* Sort operations */
#define REDIS_SORT_GET          0
#define REDIS_SORT_ASC          1
#define REDIS_SORT_DESC         2
#define REDIS_SORTKEY_MAX       1024

/* Log levels, from the most to the least verbose */
#define REDIS_DEBUG             0
#define REDIS_VERBOSE           1
#define REDIS_NOTICE            2
#define REDIS_WARNING           3
ed9b544e 217
/* Anti-warning macro: evaluates V and discards the result so that unused
 * parameters/variables do not trigger compiler warnings. The argument is
 * parenthesized so that any expression (e.g. `a + 1`) is cast as a whole
 * instead of only its first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
220
/* Skiplist parameters used by sorted sets */
#define ZSKIPLIST_MAXLEVEL      32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P             0.25    /* Skiplist P = 1/4 */

/* Append only file: fsync policies */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2
228
/* Project-specific assert. When the expression _e is false, _redisAssert()
 * is called with the stringified expression, the file name and the line
 * number, then the process terminates through _exit(1). The original note
 * says this form exists because "we can print the stacktrace" (presumably
 * done by _redisAssert(), which is defined elsewhere in this file).
 * The macro is a void expression, so it can be used wherever an expression
 * statement is allowed. */
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 232
ed9b544e 233/*================================= Data types ============================== */
234
235/* A redis object, that is a type able to hold a string / list / set */
75680a3c 236
/* Virtual memory bookkeeping attached to a Redis object.
 * NOTE(review): the trailing declarator also defines a global variable
 * named 'vm' of this type; it is kept as-is because users of it, if any,
 * are outside this view. */
struct redisObjectVM {
    /* The page at which the object is stored on disk */
    off_t page;
    /* Number of pages used on disk */
    off_t usedpages;
    /* Last access time */
    time_t atime;
} vm;
243
244/* The actual Redis Object */
ed9b544e 245typedef struct redisObject {
ed9b544e 246 void *ptr;
942a3961 247 unsigned char type;
248 unsigned char encoding;
d894161b 249 unsigned char storage; /* If this object is a key, where is the value?
250 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
251 unsigned char vtype; /* If this object is a key, and value is swapped out,
252 * this is the type of the swapped out object. */
ed9b544e 253 int refcount;
75680a3c 254 /* VM fields, this are only allocated if VM is active, otherwise the
255 * object allocation function will just allocate
256 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
257 * Redis without VM active will not have any overhead. */
258 struct redisObjectVM vm;
ed9b544e 259} robj;
260
/* Macro used to initialize a Redis string object that was NOT obtained from
 * the object allocator (typically an automatic variable on the stack).
 *
 *   _var  an lvalue of type robj (not a pointer; pass *p for a pointer p)
 *   _ptr  the value stored in the object's 'ptr' field
 *
 * The 'storage' field is only written when server.vm_enabled is true.
 *
 * This macro is kept near the structure definition to make sure it gets
 * updated when the structure changes, to avoid bugs like bug #85, which
 * was introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller's
 * own ';' completes the do/while statement, which keeps constructs like
 *     if (cond) initStaticStringObject(o,p); else ...
 * valid. Arguments are parenthesized so that expressions such as `*objp`
 * expand correctly. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
272
3305306f 273typedef struct redisDb {
4409877e 274 dict *dict; /* The keyspace for this DB */
275 dict *expires; /* Timeout of keys with a timeout set */
276 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 277 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 278 int id;
279} redisDb;
280
6e469882 281/* Client MULTI/EXEC state */
282typedef struct multiCmd {
283 robj **argv;
284 int argc;
285 struct redisCommand *cmd;
286} multiCmd;
287
288typedef struct multiState {
289 multiCmd *commands; /* Array of MULTI commands */
290 int count; /* Total number of MULTI commands */
291} multiState;
292
ed9b544e 293/* With multiplexing we need to take per-clinet state.
294 * Clients are taken in a liked list. */
295typedef struct redisClient {
296 int fd;
3305306f 297 redisDb *db;
ed9b544e 298 int dictid;
299 sds querybuf;
e8a74421 300 robj **argv, **mbargv;
301 int argc, mbargc;
40d224a9 302 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 303 int multibulk; /* multi bulk command format active */
ed9b544e 304 list *reply;
305 int sentlen;
306 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 307 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 308 int slaveseldb; /* slave selected db, if this client is a slave */
309 int authenticated; /* when requirepass is non-NULL */
310 int replstate; /* replication state if this is a slave */
311 int repldbfd; /* replication DB file descriptor */
6e469882 312 long repldboff; /* replication DB file offset */
40d224a9 313 off_t repldbsize; /* replication DB file size */
6e469882 314 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 315 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 316 * operation such as BLPOP. Otherwise NULL. */
b177fd30 317 int blockingkeysnum; /* Number of blocking keys */
4409877e 318 time_t blockingto; /* Blocking operation timeout. If UNIX current time
319 * is >= blockingto then the operation timed out. */
92f8e882 320 list *io_keys; /* Keys this client is waiting to be loaded from the
321 * swap file in order to continue. */
ed9b544e 322} redisClient;
323
/* One save point: a (seconds, changes) pair. See the 'saveparams' array in
 * struct redisServer. */
struct saveparam {
    time_t seconds;
    int changes;
};
328
329/* Global server state structure */
330struct redisServer {
331 int port;
332 int fd;
3305306f 333 redisDb *db;
4409877e 334 dict *sharingpool; /* Poll used for object sharing */
10c43610 335 unsigned int sharingpoolsize;
ed9b544e 336 long long dirty; /* changes to DB from the last save */
337 list *clients;
87eca727 338 list *slaves, *monitors;
ed9b544e 339 char neterr[ANET_ERR_LEN];
340 aeEventLoop *el;
341 int cronloops; /* number of times the cron function run */
342 list *objfreelist; /* A list of freed objects to avoid malloc() */
343 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 344 /* Fields used only for stats */
345 time_t stat_starttime; /* server start time */
346 long long stat_numcommands; /* number of processed commands */
347 long long stat_numconnections; /* number of connections received */
348 /* Configuration */
349 int verbosity;
350 int glueoutputbuf;
351 int maxidletime;
352 int dbnum;
353 int daemonize;
44b38ef4 354 int appendonly;
48f0308a 355 int appendfsync;
356 time_t lastfsync;
44b38ef4 357 int appendfd;
358 int appendseldb;
ed329fcf 359 char *pidfile;
9f3c422c 360 pid_t bgsavechildpid;
9d65a1bb 361 pid_t bgrewritechildpid;
362 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
ed9b544e 363 struct saveparam *saveparams;
364 int saveparamslen;
365 char *logfile;
366 char *bindaddr;
367 char *dbfilename;
44b38ef4 368 char *appendfilename;
abcb223e 369 char *requirepass;
10c43610 370 int shareobjects;
121f70cf 371 int rdbcompression;
ed9b544e 372 /* Replication related */
373 int isslave;
d0ccebcf 374 char *masterauth;
ed9b544e 375 char *masterhost;
376 int masterport;
40d224a9 377 redisClient *master; /* client that is master for this slave */
ed9b544e 378 int replstate;
285add55 379 unsigned int maxclients;
4ef8de8a 380 unsigned long long maxmemory;
d5d55fc3 381 unsigned int blpop_blocked_clients;
382 unsigned int vm_blocked_clients;
ed9b544e 383 /* Sort parameters - qsort_r() is only available under BSD so we
384 * have to take this state global, in order to pass it to sortCompare() */
385 int sort_desc;
386 int sort_alpha;
387 int sort_bypattern;
75680a3c 388 /* Virtual memory configuration */
389 int vm_enabled;
054e426d 390 char *vm_swap_file;
75680a3c 391 off_t vm_page_size;
392 off_t vm_pages;
4ef8de8a 393 unsigned long long vm_max_memory;
75680a3c 394 /* Virtual memory state */
395 FILE *vm_fp;
396 int vm_fd;
397 off_t vm_next_page; /* Next probably empty page */
398 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 399 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 400 time_t unixtime; /* Unix time sampled every second. */
92f8e882 401 /* Virtual memory I/O threads stuff */
92f8e882 402 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 403 * put the result of the operation in the io_done list. While the
404 * job is being processed, it's put on io_processing queue. */
405 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
406 list *io_processing; /* List of VM I/O jobs being processed */
407 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 408 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 409 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 410 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
411 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 412 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 413 int io_active_threads; /* Number of running I/O threads */
414 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 415 /* Our main thread is blocked on the event loop, locking for sockets ready
416 * to be read or written, so when a threaded I/O operation is ready to be
417 * processed by the main thread, the I/O thread will use a unix pipe to
418 * awake the main thread. The followings are the two pipe FDs. */
419 int io_ready_pipe_read;
420 int io_ready_pipe_write;
7d98e08c 421 /* Virtual memory stats */
422 unsigned long long vm_stats_used_pages;
423 unsigned long long vm_stats_swapped_objects;
424 unsigned long long vm_stats_swapouts;
425 unsigned long long vm_stats_swapins;
b9bc0eef 426 FILE *devnull;
ed9b544e 427};
428
429typedef void redisCommandProc(redisClient *c);
430struct redisCommand {
431 char *name;
432 redisCommandProc *proc;
433 int arity;
434 int flags;
7c775e09 435 /* What keys should be loaded in background when calling this command? */
436 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
437 int vm_lastkey; /* THe last argument that's a key */
438 int vm_keystep; /* The step between first and last key */
ed9b544e 439};
440
/* Pairs a function name with an unsigned long 'pointer' value
 * (presumably the function address; users are outside this view). */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
445
ed9b544e 446typedef struct _redisSortObject {
447 robj *obj;
448 union {
449 double score;
450 robj *cmpobj;
451 } u;
452} redisSortObject;
453
454typedef struct _redisSortOperation {
455 int type;
456 robj *pattern;
457} redisSortOperation;
458
6b47e12e 459/* ZSETs use a specialized version of Skiplists */
460
461typedef struct zskiplistNode {
462 struct zskiplistNode **forward;
e3870fab 463 struct zskiplistNode *backward;
912b9165 464 unsigned int *span;
6b47e12e 465 double score;
466 robj *obj;
467} zskiplistNode;
468
/* The skiplist itself: pointers to the header and tail nodes plus the
 * 'length' and 'level' counters. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
474
1812e024 475typedef struct zset {
476 dict *dict;
6b47e12e 477 zskiplist *zsl;
1812e024 478} zset;
479
6b47e12e 480/* Our shared "common" objects */
481
ed9b544e 482struct sharedObjectsStruct {
c937aa89 483 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 484 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 485 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
486 *outofrangeerr, *plus,
ed9b544e 487 *select0, *select1, *select2, *select3, *select4,
488 *select5, *select6, *select7, *select8, *select9;
489} shared;
490
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
496
/* VM threaded I/O request message: job types */
#define REDIS_IOJOB_LOAD            0   /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP    1   /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP         2   /* Swap from memory to disk */
d5d55fc3 501typedef struct iojob {
996cb5f7 502 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 503 redisDb *db;/* Redis database */
92f8e882 504 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 505 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 506 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
507 off_t page; /* Swap page where to read/write the object */
b9bc0eef 508 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
996cb5f7 509 int canceled; /* True if this command was canceled by blocking side of VM */
510 pthread_t thread; /* ID of the thread processing this entry */
511} iojob;
92f8e882 512
ed9b544e 513/*================================ Prototypes =============================== */
514
515static void freeStringObject(robj *o);
516static void freeListObject(robj *o);
517static void freeSetObject(robj *o);
518static void decrRefCount(void *o);
519static robj *createObject(int type, void *ptr);
520static void freeClient(redisClient *c);
f78fd11b 521static int rdbLoad(char *filename);
ed9b544e 522static void addReply(redisClient *c, robj *obj);
523static void addReplySds(redisClient *c, sds s);
524static void incrRefCount(robj *o);
f78fd11b 525static int rdbSaveBackground(char *filename);
ed9b544e 526static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 527static robj *dupStringObject(robj *o);
87eca727 528static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 529static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 530static int syncWithMaster(void);
10c43610 531static robj *tryObjectSharing(robj *o);
942a3961 532static int tryObjectEncoding(robj *o);
9d65a1bb 533static robj *getDecodedObject(robj *o);
3305306f 534static int removeExpire(redisDb *db, robj *key);
535static int expireIfNeeded(redisDb *db, robj *key);
536static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 537static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 538static int deleteKey(redisDb *db, robj *key);
bb32ede5 539static time_t getExpire(redisDb *db, robj *key);
540static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 541static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 542static void freeMemoryIfNeeded(void);
de96dbfe 543static int processCommand(redisClient *c);
56906eef 544static void setupSigSegvAction(void);
a3b21203 545static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 546static void aofRemoveTempFile(pid_t childpid);
0ea663ea 547static size_t stringObjectLen(robj *o);
638e42ac 548static void processInputBuffer(redisClient *c);
6b47e12e 549static zskiplist *zslCreate(void);
fd8ccf44 550static void zslFree(zskiplist *zsl);
2b59cfdf 551static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 552static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 553static void initClientMultiState(redisClient *c);
554static void freeClientMultiState(redisClient *c);
555static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 556static void unblockClientWaitingData(redisClient *c);
4409877e 557static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 558static void vmInit(void);
a35ddf12 559static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 560static robj *vmLoadObject(robj *key);
7e69548d 561static robj *vmPreviewObject(robj *key);
a69a0c9c 562static int vmSwapOneObjectBlocking(void);
563static int vmSwapOneObjectThreaded(void);
7e69548d 564static int vmCanSwapOut(void);
a5819310 565static int tryFreeOneObjectFromFreelist(void);
996cb5f7 566static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
567static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
568static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 569static void lockThreadedIO(void);
570static void unlockThreadedIO(void);
571static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
572static void freeIOJob(iojob *j);
573static void queueIOJob(iojob *j);
a5819310 574static int vmWriteObjectOnSwap(robj *o, off_t page);
575static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 576static void waitEmptyIOJobsQueue(void);
577static void vmReopenSwapFile(void);
970e10bb 578static int vmFreePage(off_t page);
d5d55fc3 579static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
580static int dontWaitForSwappedKey(redisClient *c, robj *key);
581static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
582static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
583static struct redisCommand *lookupCommand(char *name);
584static void call(redisClient *c, struct redisCommand *cmd);
585static void resetClient(redisClient *c);
ed9b544e 586
abcb223e 587static void authCommand(redisClient *c);
ed9b544e 588static void pingCommand(redisClient *c);
589static void echoCommand(redisClient *c);
590static void setCommand(redisClient *c);
591static void setnxCommand(redisClient *c);
592static void getCommand(redisClient *c);
593static void delCommand(redisClient *c);
594static void existsCommand(redisClient *c);
595static void incrCommand(redisClient *c);
596static void decrCommand(redisClient *c);
597static void incrbyCommand(redisClient *c);
598static void decrbyCommand(redisClient *c);
599static void selectCommand(redisClient *c);
600static void randomkeyCommand(redisClient *c);
601static void keysCommand(redisClient *c);
602static void dbsizeCommand(redisClient *c);
603static void lastsaveCommand(redisClient *c);
604static void saveCommand(redisClient *c);
605static void bgsaveCommand(redisClient *c);
9d65a1bb 606static void bgrewriteaofCommand(redisClient *c);
ed9b544e 607static void shutdownCommand(redisClient *c);
608static void moveCommand(redisClient *c);
609static void renameCommand(redisClient *c);
610static void renamenxCommand(redisClient *c);
611static void lpushCommand(redisClient *c);
612static void rpushCommand(redisClient *c);
613static void lpopCommand(redisClient *c);
614static void rpopCommand(redisClient *c);
615static void llenCommand(redisClient *c);
616static void lindexCommand(redisClient *c);
617static void lrangeCommand(redisClient *c);
618static void ltrimCommand(redisClient *c);
619static void typeCommand(redisClient *c);
620static void lsetCommand(redisClient *c);
621static void saddCommand(redisClient *c);
622static void sremCommand(redisClient *c);
a4460ef4 623static void smoveCommand(redisClient *c);
ed9b544e 624static void sismemberCommand(redisClient *c);
625static void scardCommand(redisClient *c);
12fea928 626static void spopCommand(redisClient *c);
2abb95a9 627static void srandmemberCommand(redisClient *c);
ed9b544e 628static void sinterCommand(redisClient *c);
629static void sinterstoreCommand(redisClient *c);
40d224a9 630static void sunionCommand(redisClient *c);
631static void sunionstoreCommand(redisClient *c);
f4f56e1d 632static void sdiffCommand(redisClient *c);
633static void sdiffstoreCommand(redisClient *c);
ed9b544e 634static void syncCommand(redisClient *c);
635static void flushdbCommand(redisClient *c);
636static void flushallCommand(redisClient *c);
637static void sortCommand(redisClient *c);
638static void lremCommand(redisClient *c);
0f5f7e9a 639static void rpoplpushcommand(redisClient *c);
ed9b544e 640static void infoCommand(redisClient *c);
70003d28 641static void mgetCommand(redisClient *c);
87eca727 642static void monitorCommand(redisClient *c);
3305306f 643static void expireCommand(redisClient *c);
802e8373 644static void expireatCommand(redisClient *c);
f6b141c5 645static void getsetCommand(redisClient *c);
fd88489a 646static void ttlCommand(redisClient *c);
321b0e13 647static void slaveofCommand(redisClient *c);
7f957c92 648static void debugCommand(redisClient *c);
f6b141c5 649static void msetCommand(redisClient *c);
650static void msetnxCommand(redisClient *c);
fd8ccf44 651static void zaddCommand(redisClient *c);
7db723ad 652static void zincrbyCommand(redisClient *c);
cc812361 653static void zrangeCommand(redisClient *c);
50c55df5 654static void zrangebyscoreCommand(redisClient *c);
f44dd428 655static void zcountCommand(redisClient *c);
e3870fab 656static void zrevrangeCommand(redisClient *c);
3c41331e 657static void zcardCommand(redisClient *c);
1b7106e7 658static void zremCommand(redisClient *c);
6e333bbe 659static void zscoreCommand(redisClient *c);
1807985b 660static void zremrangebyscoreCommand(redisClient *c);
6e469882 661static void multiCommand(redisClient *c);
662static void execCommand(redisClient *c);
18b6cb76 663static void discardCommand(redisClient *c);
4409877e 664static void blpopCommand(redisClient *c);
665static void brpopCommand(redisClient *c);
4b00bebd 666static void appendCommand(redisClient *c);
39191553 667static void substrCommand(redisClient *c);
69d95c3e 668static void zrankCommand(redisClient *c);
978c2c94 669static void hsetCommand(redisClient *c);
670static void hgetCommand(redisClient *c);
f6b141c5 671
ed9b544e 672/*================================= Globals ================================= */
673
674/* Global vars */
/* Global server state and the static command table.
 *
 * Each cmdTable row lists, in order: the command name as typed by the
 * client, the handler function, an arity value, a REDIS_CMD_* flag mask and
 * three trailing integers.
 * NOTE(review): the arity sign convention (negative presumably meaning "at
 * least N arguments") and the meaning of the last three integers (presumably
 * first key, last key and key step used for VM key preloading) are defined
 * by struct redisCommand, which is not visible here -- confirm there.
 *
 * "smembers" is served by sinterCommand, and the table is terminated by an
 * all-NULL/zero sentinel row. */
675static struct redisServer server; /* server global state */
676static struct redisCommand cmdTable[] = {
7c775e09 677 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
678 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
679 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
680 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
39191553 681 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
7c775e09 682 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
683 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
684 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
685 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
686 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
687 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
688 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
689 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
690 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
691 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
692 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
693 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
694 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
695 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
696 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
697 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
698 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
699 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
700 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
701 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
702 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
703 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
704 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
705 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
706 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
707 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
708 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
709 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
710 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
711 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
712 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
713 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
715 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
716 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
717 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
718 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
719 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
720 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
721 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
722 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
69d95c3e 724 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
978c2c94 725 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
726 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
7c775e09 727 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
728 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
729 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
730 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
731 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
732 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
733 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
734 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
735 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
736 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
737 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
738 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
739 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
740 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
741 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
742 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
743 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
744 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
745 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
746 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
747 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
748 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
749 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
750 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
751 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
18b6cb76 752 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
7c775e09 753 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
754 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
755 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
756 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
757 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
758 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
759 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
760 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
761 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
762 {NULL,NULL,0,0,0,0,0}
ed9b544e 763};
bcfc686d 764
ed9b544e 765/*============================ Utility functions ============================ */
766
/* Glob-style pattern matching.
 *
 * Returns 1 when the first 'stringLen' bytes of 'string' match the first
 * 'patternLen' bytes of 'pattern', 0 otherwise. Neither buffer needs to be
 * NUL terminated: only bytes inside the given lengths are ever read.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [^abc] negates the set, [a-z] is a
 *           range, and a backslash escapes the next byte inside the set
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero, bytes are compared after tolower(). */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen > 0) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen <= 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one input byte, so it
             * can never match at the end of the string. */
            if (stringLen <= 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal byte needs an input byte to compare against. */
            if (stringLen <= 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Input exhausted: trailing stars still match the empty
             * string, but only the ones inside the pattern length. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
889
56906eef 890static void redisLog(int level, const char *fmt, ...) {
ed9b544e 891 va_list ap;
892 FILE *fp;
893
894 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
895 if (!fp) return;
896
897 va_start(ap, fmt);
898 if (level >= server.verbosity) {
6766f45e 899 char *c = ".-*#";
1904ecc1 900 char buf[64];
901 time_t now;
902
903 now = time(NULL);
6c9385e0 904 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 905 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 906 vfprintf(fp, fmt, ap);
907 fprintf(fp,"\n");
908 fflush(fp);
909 }
910 va_end(ap);
911
912 if (server.logfile) fclose(fp);
913}
914
915/*====================== Hash table type implementation ==================== */
916
917/* This is a hash table type that uses the SDS dynamic strings library as
918 * keys and redis objects as values (objects can hold SDS strings,
919 * lists, sets). */
920
/* Dict destructor for entries that are plain zmalloc()ed buffers: the
 * buffer is handed straight to zfree(). 'privdata' is unused. */
static void dictVanillaFree(void *privdata, void *val)
{
    void *buffer = val;

    DICT_NOTUSED(privdata);
    zfree(buffer);
}
926
4409877e 927static void dictListDestructor(void *privdata, void *val)
928{
929 DICT_NOTUSED(privdata);
930 listRelease((list*)val);
931}
932
ed9b544e 933static int sdsDictKeyCompare(void *privdata, const void *key1,
934 const void *key2)
935{
936 int l1,l2;
937 DICT_NOTUSED(privdata);
938
939 l1 = sdslen((sds)key1);
940 l2 = sdslen((sds)key2);
941 if (l1 != l2) return 0;
942 return memcmp(key1, key2, l1) == 0;
943}
944
/* Dict destructor for redis objects: drops one reference from 'val'.
 * NULL is tolerated because values of swapped out keys are set to NULL.
 * 'privdata' is unused. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
952
942a3961 953static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 954 const void *key2)
955{
956 const robj *o1 = key1, *o2 = key2;
957 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
958}
959
942a3961 960static unsigned int dictObjHash(const void *key) {
ed9b544e 961 const robj *o = key;
962 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
963}
964
942a3961 965static int dictEncObjKeyCompare(void *privdata, const void *key1,
966 const void *key2)
967{
9d65a1bb 968 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
969 int cmp;
942a3961 970
9d65a1bb 971 o1 = getDecodedObject(o1);
972 o2 = getDecodedObject(o2);
973 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
974 decrRefCount(o1);
975 decrRefCount(o2);
976 return cmp;
942a3961 977}
978
979static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 980 robj *o = (robj*) key;
942a3961 981
ed9e4966 982 if (o->encoding == REDIS_ENCODING_RAW) {
983 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
984 } else {
985 if (o->encoding == REDIS_ENCODING_INT) {
986 char buf[32];
987 int len;
988
989 len = snprintf(buf,32,"%ld",(long)o->ptr);
990 return dictGenHashFunction((unsigned char*)buf, len);
991 } else {
992 unsigned int hash;
993
994 o = getDecodedObject(o);
995 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
996 decrRefCount(o);
997 return hash;
998 }
999 }
942a3961 1000}
1001
f2d9f50f 1002/* Set values. initServer() also uses this type for server.sharingpool.
 * Keys are possibly encoded redis objects (hashed and compared through the
 * dictEncObj* helpers) and are released on removal; entries carry no value
 * destructor. */
ed9b544e 1003static dictType setDictType = {
942a3961 1004 dictEncObjHash, /* hash function */
ed9b544e 1005 NULL, /* key dup */
1006 NULL, /* val dup */
942a3961 1007 dictEncObjKeyCompare, /* key compare */
ed9b544e 1008 dictRedisObjectDestructor, /* key destructor */
1009 NULL /* val destructor */
1010};
1011
f2d9f50f 1012/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are possibly encoded redis objects; each value is a separately
 * allocated double that is freed with zfree() when the entry is removed. */
1812e024 1013static dictType zsetDictType = {
1014 dictEncObjHash, /* hash function */
1015 NULL, /* key dup */
1016 NULL, /* val dup */
1017 dictEncObjKeyCompare, /* key compare */
1018 dictRedisObjectDestructor, /* key destructor */
da0a1620 1019 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1020};
1021
f2d9f50f 1022/* Db->dict: the main keyspace of every database (see initServer()).
 * Keys are hashed and compared through their raw SDS ->ptr (dictObj*
 * helpers); both key and value are redis objects whose reference is
 * dropped when the entry is removed. */
5234952b 1023static dictType dbDictType = {
942a3961 1024 dictObjHash, /* hash function */
ed9b544e 1025 NULL, /* key dup */
1026 NULL, /* val dup */
942a3961 1027 dictObjKeyCompare, /* key compare */
ed9b544e 1028 dictRedisObjectDestructor, /* key destructor */
1029 dictRedisObjectDestructor /* val destructor */
1030};
1031
f2d9f50f 1032/* Db->expires (see initServer()). Keys are redis objects released on
 * removal; there is no value destructor because the value is not a heap
 * object: serverCron() reads it back by casting the entry value to time_t. */
1033static dictType keyptrDictType = {
1034 dictObjHash, /* hash function */
1035 NULL, /* key dup */
1036 NULL, /* val dup */
1037 dictObjKeyCompare, /* key compare */
1038 dictRedisObjectDestructor, /* key destructor */
1039 NULL /* val destructor */
1040};
1041
5234952b 1042/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both fields (keys) and values are possibly encoded redis objects whose
 * reference is dropped when the entry is removed. */
1043static dictType hashDictType = {
1044 dictEncObjHash, /* hash function */
1045 NULL, /* key dup */
1046 NULL, /* val dup */
1047 dictEncObjKeyCompare, /* key compare */
1048 dictRedisObjectDestructor, /* key destructor */
1049 dictRedisObjectDestructor /* val destructor */
1050};
1051
4409877e 1052/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1053 * lists as values. It's used for blocking operations (BLPOP) and to
1054 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * initServer() creates db->blockingkeys and, when VM is enabled, db->io_keys
 * with this type. Removing an entry releases the whole list. */
4409877e 1055static dictType keylistDictType = {
1056 dictObjHash, /* hash function */
1057 NULL, /* key dup */
1058 NULL, /* val dup */
1059 dictObjKeyCompare, /* key compare */
1060 dictRedisObjectDestructor, /* key destructor */
1061 dictListDestructor /* val destructor */
1062};
1063
ed9b544e 1064/* ========================= Random utility functions ======================= */
1065
1066/* Redis generally does not try to recover from out of memory conditions
1067 * when allocating objects or strings, it is not clear if it will be possible
1068 * to report this condition to the client since the networking layer itself
1069 * is based on heap allocation for send buffers, so we simply abort.
1070 * At least the code will be simpler to read... */
1071static void oom(const char *msg) {
71c54b21 1072 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1073 sleep(1);
1074 abort();
1075}
1076
1077/* ====================== Redis server networking stuff ===================== */
56906eef 1078static void closeTimedoutClients(void) {
ed9b544e 1079 redisClient *c;
ed9b544e 1080 listNode *ln;
1081 time_t now = time(NULL);
c7df85a4 1082 listIter li;
ed9b544e 1083
c7df85a4 1084 listRewind(server.clients,&li);
1085 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1086 c = listNodeValue(ln);
f86a74e9 1087 if (server.maxidletime &&
1088 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1089 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1090 (now - c->lastinteraction > server.maxidletime))
1091 {
f870935d 1092 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1093 freeClient(c);
f86a74e9 1094 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1095 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1096 addReply(c,shared.nullmultibulk);
b0d8747d 1097 unblockClientWaitingData(c);
f86a74e9 1098 }
ed9b544e 1099 }
1100 }
ed9b544e 1101}
1102
12fea928 1103static int htNeedsResize(dict *dict) {
1104 long long size, used;
1105
1106 size = dictSlots(dict);
1107 used = dictSize(dict);
1108 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1109 (used*100/size < REDIS_HT_MINFILL));
1110}
1111
0bc03378 1112/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1113 * we resize the hash table to save memory */
56906eef 1114static void tryResizeHashTables(void) {
0bc03378 1115 int j;
1116
1117 for (j = 0; j < server.dbnum; j++) {
12fea928 1118 if (htNeedsResize(server.db[j].dict)) {
f870935d 1119 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1120 dictResize(server.db[j].dict);
f870935d 1121 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1122 }
12fea928 1123 if (htNeedsResize(server.db[j].expires))
1124 dictResize(server.db[j].expires);
0bc03378 1125 }
1126}
1127
9d65a1bb 1128/* A background saving child (BGSAVE) terminated its work. Handle this. */
1129void backgroundSaveDoneHandler(int statloc) {
1130 int exitcode = WEXITSTATUS(statloc);
1131 int bysignal = WIFSIGNALED(statloc);
1132
1133 if (!bysignal && exitcode == 0) {
1134 redisLog(REDIS_NOTICE,
1135 "Background saving terminated with success");
1136 server.dirty = 0;
1137 server.lastsave = time(NULL);
1138 } else if (!bysignal && exitcode != 0) {
1139 redisLog(REDIS_WARNING, "Background saving error");
1140 } else {
1141 redisLog(REDIS_WARNING,
1142 "Background saving terminated by signal");
1143 rdbRemoveTempFile(server.bgsavechildpid);
1144 }
1145 server.bgsavechildpid = -1;
1146 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1147 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1148 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1149}
1150
1151/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1152 * Handle this. */
1153void backgroundRewriteDoneHandler(int statloc) {
1154 int exitcode = WEXITSTATUS(statloc);
1155 int bysignal = WIFSIGNALED(statloc);
1156
1157 if (!bysignal && exitcode == 0) {
1158 int fd;
1159 char tmpfile[256];
1160
1161 redisLog(REDIS_NOTICE,
1162 "Background append only file rewriting terminated with success");
1163 /* Now it's time to flush the differences accumulated by the parent */
1164 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1165 fd = open(tmpfile,O_WRONLY|O_APPEND);
1166 if (fd == -1) {
1167 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1168 goto cleanup;
1169 }
1170 /* Flush our data... */
1171 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1172 (signed) sdslen(server.bgrewritebuf)) {
1173 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1174 close(fd);
1175 goto cleanup;
1176 }
b32627cd 1177 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1178 /* Now our work is to rename the temp file into the stable file. And
1179 * switch the file descriptor used by the server for append only. */
1180 if (rename(tmpfile,server.appendfilename) == -1) {
1181 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1182 close(fd);
1183 goto cleanup;
1184 }
1185 /* Mission completed... almost */
1186 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1187 if (server.appendfd != -1) {
1188 /* If append only is actually enabled... */
1189 close(server.appendfd);
1190 server.appendfd = fd;
1191 fsync(fd);
85a83172 1192 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1193 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1194 } else {
1195 /* If append only is disabled we just generate a dump in this
1196 * format. Why not? */
1197 close(fd);
1198 }
1199 } else if (!bysignal && exitcode != 0) {
1200 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1201 } else {
1202 redisLog(REDIS_WARNING,
1203 "Background append only file rewriting terminated by signal");
1204 }
1205cleanup:
1206 sdsfree(server.bgrewritebuf);
1207 server.bgrewritebuf = sdsempty();
1208 aofRemoveTempFile(server.bgrewritechildpid);
1209 server.bgrewritechildpid = -1;
1210}
1211
56906eef 1212static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1213 int j, loops = server.cronloops++;
ed9b544e 1214 REDIS_NOTUSED(eventLoop);
1215 REDIS_NOTUSED(id);
1216 REDIS_NOTUSED(clientData);
1217
3a66edc7 1218 /* We take a cached value of the unix time in the global state because
1219 * with virtual memory and aging there is to store the current time
1220 * in objects at every object access, and accuracy is not needed.
1221 * To access a global var is faster than calling time(NULL) */
1222 server.unixtime = time(NULL);
1223
0bc03378 1224 /* Show some info about non-empty databases */
ed9b544e 1225 for (j = 0; j < server.dbnum; j++) {
dec423d9 1226 long long size, used, vkeys;
94754ccc 1227
3305306f 1228 size = dictSlots(server.db[j].dict);
1229 used = dictSize(server.db[j].dict);
94754ccc 1230 vkeys = dictSize(server.db[j].expires);
c3cb078d 1231 if (!(loops % 5) && (used || vkeys)) {
f870935d 1232 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1233 /* dictPrintStats(server.dict); */
ed9b544e 1234 }
ed9b544e 1235 }
1236
0bc03378 1237 /* We don't want to resize the hash tables while a bacground saving
1238 * is in progress: the saving child is created using fork() that is
1239 * implemented with a copy-on-write semantic in most modern systems, so
1240 * if we resize the HT while there is the saving child at work actually
1241 * a lot of memory movements in the parent will cause a lot of pages
1242 * copied. */
9d65a1bb 1243 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1244
ed9b544e 1245 /* Show information about connected clients */
1246 if (!(loops % 5)) {
f870935d 1247 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1248 listLength(server.clients)-listLength(server.slaves),
1249 listLength(server.slaves),
b72f6a4b 1250 zmalloc_used_memory(),
3305306f 1251 dictSize(server.sharingpool));
ed9b544e 1252 }
1253
1254 /* Close connections of timedout clients */
d5d55fc3 1255 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
ed9b544e 1256 closeTimedoutClients();
1257
9d65a1bb 1258 /* Check if a background saving or AOF rewrite in progress terminated */
1259 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1260 int statloc;
9d65a1bb 1261 pid_t pid;
1262
1263 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1264 if (pid == server.bgsavechildpid) {
1265 backgroundSaveDoneHandler(statloc);
ed9b544e 1266 } else {
9d65a1bb 1267 backgroundRewriteDoneHandler(statloc);
ed9b544e 1268 }
ed9b544e 1269 }
1270 } else {
1271 /* If there is not a background saving in progress check if
1272 * we have to save now */
1273 time_t now = time(NULL);
1274 for (j = 0; j < server.saveparamslen; j++) {
1275 struct saveparam *sp = server.saveparams+j;
1276
1277 if (server.dirty >= sp->changes &&
1278 now-server.lastsave > sp->seconds) {
1279 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1280 sp->changes, sp->seconds);
f78fd11b 1281 rdbSaveBackground(server.dbfilename);
ed9b544e 1282 break;
1283 }
1284 }
1285 }
94754ccc 1286
f2324293 1287 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1288 * will use few CPU cycles if there are few expiring keys, otherwise
1289 * it will get more aggressive to avoid that too much memory is used by
1290 * keys that can be removed from the keyspace. */
94754ccc 1291 for (j = 0; j < server.dbnum; j++) {
f2324293 1292 int expired;
94754ccc 1293 redisDb *db = server.db+j;
94754ccc 1294
f2324293 1295 /* Continue to expire if at the end of the cycle more than 25%
1296 * of the keys were expired. */
1297 do {
4ef8de8a 1298 long num = dictSize(db->expires);
94754ccc 1299 time_t now = time(NULL);
1300
f2324293 1301 expired = 0;
94754ccc 1302 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1303 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1304 while (num--) {
1305 dictEntry *de;
1306 time_t t;
1307
1308 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1309 t = (time_t) dictGetEntryVal(de);
1310 if (now > t) {
1311 deleteKey(db,dictGetEntryKey(de));
f2324293 1312 expired++;
94754ccc 1313 }
1314 }
f2324293 1315 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1316 }
1317
4ef8de8a 1318 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1319 * is enbled. Try to free objects from the free list first. */
7e69548d 1320 if (vmCanSwapOut()) {
1321 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1322 server.vm_max_memory)
1323 {
72e9fd40 1324 int retval;
1325
a5819310 1326 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1327 retval = (server.vm_max_threads == 0) ?
1328 vmSwapOneObjectBlocking() :
1329 vmSwapOneObjectThreaded();
1330 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1331 zmalloc_used_memory() >
1332 (server.vm_max_memory+server.vm_max_memory/10))
1333 {
1334 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1335 }
72e9fd40 1336 /* Note that when using threade I/O we free just one object,
1337 * because anyway when the I/O thread in charge to swap this
1338 * object out will finish, the handler of completed jobs
1339 * will try to swap more objects if we are still out of memory. */
1340 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1341 }
1342 }
1343
ed9b544e 1344 /* Check if we should connect to a MASTER */
1345 if (server.replstate == REDIS_REPL_CONNECT) {
1346 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1347 if (syncWithMaster() == REDIS_OK) {
1348 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1349 }
1350 }
1351 return 1000;
1352}
1353
d5d55fc3 1354/* This function gets called every time Redis is entering the
1355 * main loop of the event driven library, that is, before to sleep
1356 * for ready file descriptors. */
1357static void beforeSleep(struct aeEventLoop *eventLoop) {
1358 REDIS_NOTUSED(eventLoop);
1359
1360 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1361 listIter li;
1362 listNode *ln;
1363
1364 listRewind(server.io_ready_clients,&li);
1365 while((ln = listNext(&li))) {
1366 redisClient *c = ln->value;
1367 struct redisCommand *cmd;
1368
1369 /* Resume the client. */
1370 listDelNode(server.io_ready_clients,ln);
1371 c->flags &= (~REDIS_IO_WAIT);
1372 server.vm_blocked_clients--;
1373 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1374 readQueryFromClient, c);
1375 cmd = lookupCommand(c->argv[0]->ptr);
1376 assert(cmd != NULL);
1377 call(c,cmd);
1378 resetClient(c);
1379 /* There may be more data to process in the input buffer. */
1380 if (c->querybuf && sdslen(c->querybuf) > 0)
1381 processInputBuffer(c);
1382 }
1383 }
1384}
1385
ed9b544e 1386static void createSharedObjects(void) {
1387 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1388 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1389 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1390 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1391 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1392 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1393 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1394 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1395 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1396 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1397 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1398 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1399 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1400 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1401 "-ERR no such key\r\n"));
ed9b544e 1402 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1403 "-ERR syntax error\r\n"));
c937aa89 1404 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1405 "-ERR source and destination objects are the same\r\n"));
1406 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1407 "-ERR index out of range\r\n"));
ed9b544e 1408 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1409 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1410 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1411 shared.select0 = createStringObject("select 0\r\n",10);
1412 shared.select1 = createStringObject("select 1\r\n",10);
1413 shared.select2 = createStringObject("select 2\r\n",10);
1414 shared.select3 = createStringObject("select 3\r\n",10);
1415 shared.select4 = createStringObject("select 4\r\n",10);
1416 shared.select5 = createStringObject("select 5\r\n",10);
1417 shared.select6 = createStringObject("select 6\r\n",10);
1418 shared.select7 = createStringObject("select 7\r\n",10);
1419 shared.select8 = createStringObject("select 8\r\n",10);
1420 shared.select9 = createStringObject("select 9\r\n",10);
1421}
1422
1423static void appendServerSaveParams(time_t seconds, int changes) {
1424 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1425 server.saveparams[server.saveparamslen].seconds = seconds;
1426 server.saveparams[server.saveparamslen].changes = changes;
1427 server.saveparamslen++;
1428}
1429
bcfc686d 1430static void resetServerSaveParams() {
ed9b544e 1431 zfree(server.saveparams);
1432 server.saveparams = NULL;
1433 server.saveparamslen = 0;
1434}
1435
1436static void initServerConfig() {
1437 server.dbnum = REDIS_DEFAULT_DBNUM;
1438 server.port = REDIS_SERVERPORT;
f870935d 1439 server.verbosity = REDIS_VERBOSE;
ed9b544e 1440 server.maxidletime = REDIS_MAXIDLETIME;
1441 server.saveparams = NULL;
1442 server.logfile = NULL; /* NULL = log on standard output */
1443 server.bindaddr = NULL;
1444 server.glueoutputbuf = 1;
1445 server.daemonize = 0;
44b38ef4 1446 server.appendonly = 0;
4e141d5a 1447 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1448 server.lastfsync = time(NULL);
44b38ef4 1449 server.appendfd = -1;
1450 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1451 server.pidfile = "/var/run/redis.pid";
ed9b544e 1452 server.dbfilename = "dump.rdb";
9d65a1bb 1453 server.appendfilename = "appendonly.aof";
abcb223e 1454 server.requirepass = NULL;
10c43610 1455 server.shareobjects = 0;
b0553789 1456 server.rdbcompression = 1;
21aecf4b 1457 server.sharingpoolsize = 1024;
285add55 1458 server.maxclients = 0;
d5d55fc3 1459 server.blpop_blocked_clients = 0;
3fd78bcd 1460 server.maxmemory = 0;
75680a3c 1461 server.vm_enabled = 0;
054e426d 1462 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1463 server.vm_page_size = 256; /* 256 bytes per page */
1464 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1465 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1466 server.vm_max_threads = 4;
d5d55fc3 1467 server.vm_blocked_clients = 0;
75680a3c 1468
bcfc686d 1469 resetServerSaveParams();
ed9b544e 1470
1471 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1472 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1473 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1474 /* Replication related */
1475 server.isslave = 0;
d0ccebcf 1476 server.masterauth = NULL;
ed9b544e 1477 server.masterhost = NULL;
1478 server.masterport = 6379;
1479 server.master = NULL;
1480 server.replstate = REDIS_REPL_NONE;
a7866db6 1481
1482 /* Double constants initialization */
1483 R_Zero = 0.0;
1484 R_PosInf = 1.0/R_Zero;
1485 R_NegInf = -1.0/R_Zero;
1486 R_Nan = R_Zero/R_Zero;
ed9b544e 1487}
1488
1489static void initServer() {
1490 int j;
1491
1492 signal(SIGHUP, SIG_IGN);
1493 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1494 setupSigSegvAction();
ed9b544e 1495
b9bc0eef 1496 server.devnull = fopen("/dev/null","w");
1497 if (server.devnull == NULL) {
1498 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1499 exit(1);
1500 }
ed9b544e 1501 server.clients = listCreate();
1502 server.slaves = listCreate();
87eca727 1503 server.monitors = listCreate();
ed9b544e 1504 server.objfreelist = listCreate();
1505 createSharedObjects();
1506 server.el = aeCreateEventLoop();
3305306f 1507 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1508 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1509 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1510 if (server.fd == -1) {
1511 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1512 exit(1);
1513 }
3305306f 1514 for (j = 0; j < server.dbnum; j++) {
5234952b 1515 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1516 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1517 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1518 if (server.vm_enabled)
1519 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1520 server.db[j].id = j;
1521 }
ed9b544e 1522 server.cronloops = 0;
9f3c422c 1523 server.bgsavechildpid = -1;
9d65a1bb 1524 server.bgrewritechildpid = -1;
1525 server.bgrewritebuf = sdsempty();
ed9b544e 1526 server.lastsave = time(NULL);
1527 server.dirty = 0;
ed9b544e 1528 server.stat_numcommands = 0;
1529 server.stat_numconnections = 0;
1530 server.stat_starttime = time(NULL);
3a66edc7 1531 server.unixtime = time(NULL);
d8f8b666 1532 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1533 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1534 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1535
1536 if (server.appendonly) {
71eba477 1537 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1538 if (server.appendfd == -1) {
1539 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1540 strerror(errno));
1541 exit(1);
1542 }
1543 }
75680a3c 1544
1545 if (server.vm_enabled) vmInit();
ed9b544e 1546}
1547
1548/* Empty the whole database */
ca37e9cd 1549static long long emptyDb() {
ed9b544e 1550 int j;
ca37e9cd 1551 long long removed = 0;
ed9b544e 1552
3305306f 1553 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1554 removed += dictSize(server.db[j].dict);
3305306f 1555 dictEmpty(server.db[j].dict);
1556 dictEmpty(server.db[j].expires);
1557 }
ca37e9cd 1558 return removed;
ed9b544e 1559}
1560
/* Map a case-insensitive "yes" / "no" string to 1 / 0.
 * Any other input yields -1 so that callers can report a bad argument. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0)
        return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1566
ed9b544e 1567/* I agree, this is a very rudimental way to load a configuration...
1568 will improve later if the config gets more complex */
1569static void loadServerConfig(char *filename) {
c9a111ac 1570 FILE *fp;
ed9b544e 1571 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1572 int linenum = 0;
1573 sds line = NULL;
c9a111ac 1574
1575 if (filename[0] == '-' && filename[1] == '\0')
1576 fp = stdin;
1577 else {
1578 if ((fp = fopen(filename,"r")) == NULL) {
1579 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1580 exit(1);
1581 }
ed9b544e 1582 }
c9a111ac 1583
ed9b544e 1584 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1585 sds *argv;
1586 int argc, j;
1587
1588 linenum++;
1589 line = sdsnew(buf);
1590 line = sdstrim(line," \t\r\n");
1591
1592 /* Skip comments and blank lines*/
1593 if (line[0] == '#' || line[0] == '\0') {
1594 sdsfree(line);
1595 continue;
1596 }
1597
1598 /* Split into arguments */
1599 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1600 sdstolower(argv[0]);
1601
1602 /* Execute config directives */
bb0b03a3 1603 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1604 server.maxidletime = atoi(argv[1]);
0150db36 1605 if (server.maxidletime < 0) {
ed9b544e 1606 err = "Invalid timeout value"; goto loaderr;
1607 }
bb0b03a3 1608 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1609 server.port = atoi(argv[1]);
1610 if (server.port < 1 || server.port > 65535) {
1611 err = "Invalid port"; goto loaderr;
1612 }
bb0b03a3 1613 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1614 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1615 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1616 int seconds = atoi(argv[1]);
1617 int changes = atoi(argv[2]);
1618 if (seconds < 1 || changes < 0) {
1619 err = "Invalid save parameters"; goto loaderr;
1620 }
1621 appendServerSaveParams(seconds,changes);
bb0b03a3 1622 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1623 if (chdir(argv[1]) == -1) {
1624 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1625 argv[1], strerror(errno));
1626 exit(1);
1627 }
bb0b03a3 1628 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1629 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1630 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1631 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1632 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1633 else {
1634 err = "Invalid log level. Must be one of debug, notice, warning";
1635 goto loaderr;
1636 }
bb0b03a3 1637 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1638 FILE *logfp;
ed9b544e 1639
1640 server.logfile = zstrdup(argv[1]);
bb0b03a3 1641 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1642 zfree(server.logfile);
1643 server.logfile = NULL;
1644 }
1645 if (server.logfile) {
1646 /* Test if we are able to open the file. The server will not
1647 * be able to abort just for this problem later... */
c9a111ac 1648 logfp = fopen(server.logfile,"a");
1649 if (logfp == NULL) {
ed9b544e 1650 err = sdscatprintf(sdsempty(),
1651 "Can't open the log file: %s", strerror(errno));
1652 goto loaderr;
1653 }
c9a111ac 1654 fclose(logfp);
ed9b544e 1655 }
bb0b03a3 1656 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1657 server.dbnum = atoi(argv[1]);
1658 if (server.dbnum < 1) {
1659 err = "Invalid number of databases"; goto loaderr;
1660 }
285add55 1661 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1662 server.maxclients = atoi(argv[1]);
3fd78bcd 1663 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1664 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1665 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1666 server.masterhost = sdsnew(argv[1]);
1667 server.masterport = atoi(argv[2]);
1668 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1669 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1670 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1671 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1672 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1673 err = "argument must be 'yes' or 'no'"; goto loaderr;
1674 }
bb0b03a3 1675 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1676 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1677 err = "argument must be 'yes' or 'no'"; goto loaderr;
1678 }
121f70cf 1679 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1680 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1681 err = "argument must be 'yes' or 'no'"; goto loaderr;
1682 }
e52c65b9 1683 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1684 server.sharingpoolsize = atoi(argv[1]);
1685 if (server.sharingpoolsize < 1) {
1686 err = "invalid object sharing pool size"; goto loaderr;
1687 }
bb0b03a3 1688 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1689 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1690 err = "argument must be 'yes' or 'no'"; goto loaderr;
1691 }
44b38ef4 1692 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1693 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
48f0308a 1696 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1697 if (!strcasecmp(argv[1],"no")) {
48f0308a 1698 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1699 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1700 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1701 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1702 server.appendfsync = APPENDFSYNC_EVERYSEC;
1703 } else {
1704 err = "argument must be 'no', 'always' or 'everysec'";
1705 goto loaderr;
1706 }
bb0b03a3 1707 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1708 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1709 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
054e426d 1710 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1711 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
054e426d 1712 server.dbfilename = zstrdup(argv[1]);
75680a3c 1713 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1714 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1715 err = "argument must be 'yes' or 'no'"; goto loaderr;
1716 }
054e426d 1717 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1718 zfree(server.vm_swap_file);
054e426d 1719 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1720 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1721 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1722 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1723 server.vm_page_size = strtoll(argv[1], NULL, 10);
1724 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1725 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1726 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1727 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1728 } else {
1729 err = "Bad directive or wrong number of arguments"; goto loaderr;
1730 }
1731 for (j = 0; j < argc; j++)
1732 sdsfree(argv[j]);
1733 zfree(argv);
1734 sdsfree(line);
1735 }
c9a111ac 1736 if (fp != stdin) fclose(fp);
ed9b544e 1737 return;
1738
1739loaderr:
1740 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1741 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1742 fprintf(stderr, ">>> '%s'\n", line);
1743 fprintf(stderr, "%s\n", err);
1744 exit(1);
1745}
1746
1747static void freeClientArgv(redisClient *c) {
1748 int j;
1749
1750 for (j = 0; j < c->argc; j++)
1751 decrRefCount(c->argv[j]);
e8a74421 1752 for (j = 0; j < c->mbargc; j++)
1753 decrRefCount(c->mbargv[j]);
ed9b544e 1754 c->argc = 0;
e8a74421 1755 c->mbargc = 0;
ed9b544e 1756}
1757
1758static void freeClient(redisClient *c) {
1759 listNode *ln;
1760
4409877e 1761 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1762 * call, we have to set querybuf to NULL *before* to call
1763 * unblockClientWaitingData() to avoid processInputBuffer() will get
1764 * called. Also it is important to remove the file events after
1765 * this, because this call adds the READABLE event. */
4409877e 1766 sdsfree(c->querybuf);
1767 c->querybuf = NULL;
1768 if (c->flags & REDIS_BLOCKED)
b0d8747d 1769 unblockClientWaitingData(c);
4409877e 1770
ed9b544e 1771 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1772 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1773 listRelease(c->reply);
1774 freeClientArgv(c);
1775 close(c->fd);
92f8e882 1776 /* Remove from the list of clients */
ed9b544e 1777 ln = listSearchKey(server.clients,c);
dfc5e96c 1778 redisAssert(ln != NULL);
ed9b544e 1779 listDelNode(server.clients,ln);
d5d55fc3 1780 /* Remove from the list of clients waiting for swapped keys */
1781 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1782 ln = listSearchKey(server.io_ready_clients,c);
1783 if (ln) {
1784 listDelNode(server.io_ready_clients,ln);
1785 server.vm_blocked_clients--;
1786 }
1787 }
1788 while (server.vm_enabled && listLength(c->io_keys)) {
1789 ln = listFirst(c->io_keys);
1790 dontWaitForSwappedKey(c,ln->value);
92f8e882 1791 }
b3e3d0d7 1792 listRelease(c->io_keys);
92f8e882 1793 /* Other cleanup */
ed9b544e 1794 if (c->flags & REDIS_SLAVE) {
6208b3a7 1795 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1796 close(c->repldbfd);
87eca727 1797 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1798 ln = listSearchKey(l,c);
dfc5e96c 1799 redisAssert(ln != NULL);
87eca727 1800 listDelNode(l,ln);
ed9b544e 1801 }
1802 if (c->flags & REDIS_MASTER) {
1803 server.master = NULL;
1804 server.replstate = REDIS_REPL_CONNECT;
1805 }
93ea3759 1806 zfree(c->argv);
e8a74421 1807 zfree(c->mbargv);
6e469882 1808 freeClientMultiState(c);
ed9b544e 1809 zfree(c);
1810}
1811
cc30e368 1812#define GLUEREPLY_UP_TO (1024)
ed9b544e 1813static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1814 int copylen = 0;
1815 char buf[GLUEREPLY_UP_TO];
6208b3a7 1816 listNode *ln;
c7df85a4 1817 listIter li;
ed9b544e 1818 robj *o;
1819
c7df85a4 1820 listRewind(c->reply,&li);
1821 while((ln = listNext(&li))) {
c28b42ac 1822 int objlen;
1823
ed9b544e 1824 o = ln->value;
c28b42ac 1825 objlen = sdslen(o->ptr);
1826 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1827 memcpy(buf+copylen,o->ptr,objlen);
1828 copylen += objlen;
ed9b544e 1829 listDelNode(c->reply,ln);
c28b42ac 1830 } else {
1831 if (copylen == 0) return;
1832 break;
ed9b544e 1833 }
ed9b544e 1834 }
c28b42ac 1835 /* Now the output buffer is empty, add the new single element */
1836 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1837 listAddNodeHead(c->reply,o);
ed9b544e 1838}
1839
1840static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1841 redisClient *c = privdata;
1842 int nwritten = 0, totwritten = 0, objlen;
1843 robj *o;
1844 REDIS_NOTUSED(el);
1845 REDIS_NOTUSED(mask);
1846
2895e862 1847 /* Use writev() if we have enough buffers to send */
7ea870c0 1848 if (!server.glueoutputbuf &&
1849 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1850 !(c->flags & REDIS_MASTER))
2895e862 1851 {
1852 sendReplyToClientWritev(el, fd, privdata, mask);
1853 return;
1854 }
2895e862 1855
ed9b544e 1856 while(listLength(c->reply)) {
c28b42ac 1857 if (server.glueoutputbuf && listLength(c->reply) > 1)
1858 glueReplyBuffersIfNeeded(c);
1859
ed9b544e 1860 o = listNodeValue(listFirst(c->reply));
1861 objlen = sdslen(o->ptr);
1862
1863 if (objlen == 0) {
1864 listDelNode(c->reply,listFirst(c->reply));
1865 continue;
1866 }
1867
1868 if (c->flags & REDIS_MASTER) {
6f376729 1869 /* Don't reply to a master */
ed9b544e 1870 nwritten = objlen - c->sentlen;
1871 } else {
a4d1ba9a 1872 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1873 if (nwritten <= 0) break;
1874 }
1875 c->sentlen += nwritten;
1876 totwritten += nwritten;
1877 /* If we fully sent the object on head go to the next one */
1878 if (c->sentlen == objlen) {
1879 listDelNode(c->reply,listFirst(c->reply));
1880 c->sentlen = 0;
1881 }
6f376729 1882 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1883 * bytes, in a single threaded server it's a good idea to serve
6f376729 1884 * other clients as well, even if a very large request comes from
1885 * super fast link that is always able to accept data (in real world
12f9d551 1886 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1887 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1888 }
1889 if (nwritten == -1) {
1890 if (errno == EAGAIN) {
1891 nwritten = 0;
1892 } else {
f870935d 1893 redisLog(REDIS_VERBOSE,
ed9b544e 1894 "Error writing to client: %s", strerror(errno));
1895 freeClient(c);
1896 return;
1897 }
1898 }
1899 if (totwritten > 0) c->lastinteraction = time(NULL);
1900 if (listLength(c->reply) == 0) {
1901 c->sentlen = 0;
1902 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1903 }
1904}
1905
2895e862 1906static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1907{
1908 redisClient *c = privdata;
1909 int nwritten = 0, totwritten = 0, objlen, willwrite;
1910 robj *o;
1911 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1912 int offset, ion = 0;
1913 REDIS_NOTUSED(el);
1914 REDIS_NOTUSED(mask);
1915
1916 listNode *node;
1917 while (listLength(c->reply)) {
1918 offset = c->sentlen;
1919 ion = 0;
1920 willwrite = 0;
1921
1922 /* fill-in the iov[] array */
1923 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1924 o = listNodeValue(node);
1925 objlen = sdslen(o->ptr);
1926
1927 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1928 break;
1929
1930 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1931 break; /* no more iovecs */
1932
1933 iov[ion].iov_base = ((char*)o->ptr) + offset;
1934 iov[ion].iov_len = objlen - offset;
1935 willwrite += objlen - offset;
1936 offset = 0; /* just for the first item */
1937 ion++;
1938 }
1939
1940 if(willwrite == 0)
1941 break;
1942
1943 /* write all collected blocks at once */
1944 if((nwritten = writev(fd, iov, ion)) < 0) {
1945 if (errno != EAGAIN) {
f870935d 1946 redisLog(REDIS_VERBOSE,
2895e862 1947 "Error writing to client: %s", strerror(errno));
1948 freeClient(c);
1949 return;
1950 }
1951 break;
1952 }
1953
1954 totwritten += nwritten;
1955 offset = c->sentlen;
1956
1957 /* remove written robjs from c->reply */
1958 while (nwritten && listLength(c->reply)) {
1959 o = listNodeValue(listFirst(c->reply));
1960 objlen = sdslen(o->ptr);
1961
1962 if(nwritten >= objlen - offset) {
1963 listDelNode(c->reply, listFirst(c->reply));
1964 nwritten -= objlen - offset;
1965 c->sentlen = 0;
1966 } else {
1967 /* partial write */
1968 c->sentlen += nwritten;
1969 break;
1970 }
1971 offset = 0;
1972 }
1973 }
1974
1975 if (totwritten > 0)
1976 c->lastinteraction = time(NULL);
1977
1978 if (listLength(c->reply) == 0) {
1979 c->sentlen = 0;
1980 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1981 }
1982}
1983
ed9b544e 1984static struct redisCommand *lookupCommand(char *name) {
1985 int j = 0;
1986 while(cmdTable[j].name != NULL) {
bb0b03a3 1987 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 1988 j++;
1989 }
1990 return NULL;
1991}
1992
1993/* resetClient prepare the client to process the next command */
1994static void resetClient(redisClient *c) {
1995 freeClientArgv(c);
1996 c->bulklen = -1;
e8a74421 1997 c->multibulk = 0;
ed9b544e 1998}
1999
6e469882 2000/* Call() is the core of Redis execution of a command */
2001static void call(redisClient *c, struct redisCommand *cmd) {
2002 long long dirty;
2003
2004 dirty = server.dirty;
2005 cmd->proc(c);
2006 if (server.appendonly && server.dirty-dirty)
2007 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2008 if (server.dirty-dirty && listLength(server.slaves))
2009 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2010 if (listLength(server.monitors))
2011 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2012 server.stat_numcommands++;
2013}
2014
ed9b544e 2015/* If this function gets called we already read a whole
2016 * command, argments are in the client argv/argc fields.
2017 * processCommand() execute the command or prepare the
2018 * server for a bulk read from the client.
2019 *
2020 * If 1 is returned the client is still alive and valid and
2021 * and other operations can be performed by the caller. Otherwise
2022 * if 0 is returned the client was destroied (i.e. after QUIT). */
2023static int processCommand(redisClient *c) {
2024 struct redisCommand *cmd;
ed9b544e 2025
3fd78bcd 2026 /* Free some memory if needed (maxmemory setting) */
2027 if (server.maxmemory) freeMemoryIfNeeded();
2028
e8a74421 2029 /* Handle the multi bulk command type. This is an alternative protocol
2030 * supported by Redis in order to receive commands that are composed of
2031 * multiple binary-safe "bulk" arguments. The latency of processing is
2032 * a bit higher but this allows things like multi-sets, so if this
2033 * protocol is used only for MSET and similar commands this is a big win. */
2034 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2035 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2036 if (c->multibulk <= 0) {
2037 resetClient(c);
2038 return 1;
2039 } else {
2040 decrRefCount(c->argv[c->argc-1]);
2041 c->argc--;
2042 return 1;
2043 }
2044 } else if (c->multibulk) {
2045 if (c->bulklen == -1) {
2046 if (((char*)c->argv[0]->ptr)[0] != '$') {
2047 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2048 resetClient(c);
2049 return 1;
2050 } else {
2051 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2052 decrRefCount(c->argv[0]);
2053 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2054 c->argc--;
2055 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2056 resetClient(c);
2057 return 1;
2058 }
2059 c->argc--;
2060 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2061 return 1;
2062 }
2063 } else {
2064 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2065 c->mbargv[c->mbargc] = c->argv[0];
2066 c->mbargc++;
2067 c->argc--;
2068 c->multibulk--;
2069 if (c->multibulk == 0) {
2070 robj **auxargv;
2071 int auxargc;
2072
2073 /* Here we need to swap the multi-bulk argc/argv with the
2074 * normal argc/argv of the client structure. */
2075 auxargv = c->argv;
2076 c->argv = c->mbargv;
2077 c->mbargv = auxargv;
2078
2079 auxargc = c->argc;
2080 c->argc = c->mbargc;
2081 c->mbargc = auxargc;
2082
2083 /* We need to set bulklen to something different than -1
2084 * in order for the code below to process the command without
2085 * to try to read the last argument of a bulk command as
2086 * a special argument. */
2087 c->bulklen = 0;
2088 /* continue below and process the command */
2089 } else {
2090 c->bulklen = -1;
2091 return 1;
2092 }
2093 }
2094 }
2095 /* -- end of multi bulk commands processing -- */
2096
ed9b544e 2097 /* The QUIT command is handled as a special case. Normal command
2098 * procs are unable to close the client connection safely */
bb0b03a3 2099 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2100 freeClient(c);
2101 return 0;
2102 }
d5d55fc3 2103
2104 /* Now lookup the command and check ASAP about trivial error conditions
2105 * such wrong arity, bad command name and so forth. */
ed9b544e 2106 cmd = lookupCommand(c->argv[0]->ptr);
2107 if (!cmd) {
2c14807b 2108 addReplySds(c,
2109 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2110 (char*)c->argv[0]->ptr));
ed9b544e 2111 resetClient(c);
2112 return 1;
2113 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2114 (c->argc < -cmd->arity)) {
454d4e43 2115 addReplySds(c,
2116 sdscatprintf(sdsempty(),
2117 "-ERR wrong number of arguments for '%s' command\r\n",
2118 cmd->name));
ed9b544e 2119 resetClient(c);
2120 return 1;
3fd78bcd 2121 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2122 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2123 resetClient(c);
2124 return 1;
ed9b544e 2125 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2126 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2127 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2128
2129 decrRefCount(c->argv[c->argc-1]);
2130 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2131 c->argc--;
2132 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2133 resetClient(c);
2134 return 1;
2135 }
2136 c->argc--;
2137 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2138 /* It is possible that the bulk read is already in the
8d0490e7 2139 * buffer. Check this condition and handle it accordingly.
2140 * This is just a fast path, alternative to call processInputBuffer().
2141 * It's a good idea since the code is small and this condition
2142 * happens most of the times. */
ed9b544e 2143 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2144 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2145 c->argc++;
2146 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2147 } else {
d5d55fc3 2148 /* Otherwise return... there is to read the last argument
2149 * from the socket. */
ed9b544e 2150 return 1;
2151 }
2152 }
10c43610 2153 /* Let's try to share objects on the command arguments vector */
2154 if (server.shareobjects) {
2155 int j;
2156 for(j = 1; j < c->argc; j++)
2157 c->argv[j] = tryObjectSharing(c->argv[j]);
2158 }
942a3961 2159 /* Let's try to encode the bulk object to save space. */
2160 if (cmd->flags & REDIS_CMD_BULK)
2161 tryObjectEncoding(c->argv[c->argc-1]);
2162
e63943a4 2163 /* Check if the user is authenticated */
2164 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2165 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2166 resetClient(c);
2167 return 1;
2168 }
2169
ed9b544e 2170 /* Exec the command */
18b6cb76 2171 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2172 queueMultiCommand(c,cmd);
2173 addReply(c,shared.queued);
2174 } else {
d5d55fc3 2175 if (server.vm_enabled && server.vm_max_threads > 0 &&
2176 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2177 call(c,cmd);
2178 }
ed9b544e 2179
2180 /* Prepare the client for the next command */
ed9b544e 2181 resetClient(c);
2182 return 1;
2183}
2184
87eca727 2185static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 2186 listNode *ln;
c7df85a4 2187 listIter li;
ed9b544e 2188 int outc = 0, j;
93ea3759 2189 robj **outv;
2190 /* (args*2)+1 is enough room for args, spaces, newlines */
2191 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2192
2193 if (argc <= REDIS_STATIC_ARGS) {
2194 outv = static_outv;
2195 } else {
2196 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 2197 }
ed9b544e 2198
2199 for (j = 0; j < argc; j++) {
2200 if (j != 0) outv[outc++] = shared.space;
2201 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2202 robj *lenobj;
2203
2204 lenobj = createObject(REDIS_STRING,
682ac724 2205 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 2206 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 2207 lenobj->refcount = 0;
2208 outv[outc++] = lenobj;
2209 }
2210 outv[outc++] = argv[j];
2211 }
2212 outv[outc++] = shared.crlf;
2213
40d224a9 2214 /* Increment all the refcounts at start and decrement at end in order to
2215 * be sure to free objects if there is no slave in a replication state
2216 * able to be feed with commands */
2217 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2218 listRewind(slaves,&li);
2219 while((ln = listNext(&li))) {
ed9b544e 2220 redisClient *slave = ln->value;
40d224a9 2221
2222 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2223 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2224
2225 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2226 if (slave->slaveseldb != dictid) {
2227 robj *selectcmd;
2228
2229 switch(dictid) {
2230 case 0: selectcmd = shared.select0; break;
2231 case 1: selectcmd = shared.select1; break;
2232 case 2: selectcmd = shared.select2; break;
2233 case 3: selectcmd = shared.select3; break;
2234 case 4: selectcmd = shared.select4; break;
2235 case 5: selectcmd = shared.select5; break;
2236 case 6: selectcmd = shared.select6; break;
2237 case 7: selectcmd = shared.select7; break;
2238 case 8: selectcmd = shared.select8; break;
2239 case 9: selectcmd = shared.select9; break;
2240 default:
2241 selectcmd = createObject(REDIS_STRING,
2242 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2243 selectcmd->refcount = 0;
2244 break;
2245 }
2246 addReply(slave,selectcmd);
2247 slave->slaveseldb = dictid;
2248 }
2249 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2250 }
40d224a9 2251 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2252 if (outv != static_outv) zfree(outv);
ed9b544e 2253}
2254
638e42ac 2255static void processInputBuffer(redisClient *c) {
ed9b544e 2256again:
4409877e 2257 /* Before to process the input buffer, make sure the client is not
2258 * waitig for a blocking operation such as BLPOP. Note that the first
2259 * iteration the client is never blocked, otherwise the processInputBuffer
2260 * would not be called at all, but after the execution of the first commands
2261 * in the input buffer the client may be blocked, and the "goto again"
2262 * will try to reiterate. The following line will make it return asap. */
92f8e882 2263 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2264 if (c->bulklen == -1) {
2265 /* Read the first line of the query */
2266 char *p = strchr(c->querybuf,'\n');
2267 size_t querylen;
644fafa3 2268
ed9b544e 2269 if (p) {
2270 sds query, *argv;
2271 int argc, j;
2272
2273 query = c->querybuf;
2274 c->querybuf = sdsempty();
2275 querylen = 1+(p-(query));
2276 if (sdslen(query) > querylen) {
2277 /* leave data after the first line of the query in the buffer */
2278 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2279 }
2280 *p = '\0'; /* remove "\n" */
2281 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2282 sdsupdatelen(query);
2283
2284 /* Now we can split the query in arguments */
ed9b544e 2285 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2286 sdsfree(query);
2287
2288 if (c->argv) zfree(c->argv);
2289 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2290
2291 for (j = 0; j < argc; j++) {
ed9b544e 2292 if (sdslen(argv[j])) {
2293 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2294 c->argc++;
2295 } else {
2296 sdsfree(argv[j]);
2297 }
2298 }
2299 zfree(argv);
7c49733c 2300 if (c->argc) {
2301 /* Execute the command. If the client is still valid
2302 * after processCommand() return and there is something
2303 * on the query buffer try to process the next command. */
2304 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2305 } else {
2306 /* Nothing to process, argc == 0. Just process the query
2307 * buffer if it's not empty or return to the caller */
2308 if (sdslen(c->querybuf)) goto again;
2309 }
ed9b544e 2310 return;
644fafa3 2311 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2312 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2313 freeClient(c);
2314 return;
2315 }
2316 } else {
2317 /* Bulk read handling. Note that if we are at this point
2318 the client already sent a command terminated with a newline,
2319 we are reading the bulk data that is actually the last
2320 argument of the command. */
2321 int qbl = sdslen(c->querybuf);
2322
2323 if (c->bulklen <= qbl) {
2324 /* Copy everything but the final CRLF as final argument */
2325 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2326 c->argc++;
2327 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2328 /* Process the command. If the client is still valid after
2329 * the processing and there is more data in the buffer
2330 * try to parse it. */
2331 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2332 return;
2333 }
2334 }
2335}
2336
638e42ac 2337static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2338 redisClient *c = (redisClient*) privdata;
2339 char buf[REDIS_IOBUF_LEN];
2340 int nread;
2341 REDIS_NOTUSED(el);
2342 REDIS_NOTUSED(mask);
2343
2344 nread = read(fd, buf, REDIS_IOBUF_LEN);
2345 if (nread == -1) {
2346 if (errno == EAGAIN) {
2347 nread = 0;
2348 } else {
f870935d 2349 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2350 freeClient(c);
2351 return;
2352 }
2353 } else if (nread == 0) {
f870935d 2354 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2355 freeClient(c);
2356 return;
2357 }
2358 if (nread) {
2359 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2360 c->lastinteraction = time(NULL);
2361 } else {
2362 return;
2363 }
5921aa36 2364 if (!(c->flags & REDIS_BLOCKED))
2365 processInputBuffer(c);
638e42ac 2366}
2367
ed9b544e 2368static int selectDb(redisClient *c, int id) {
2369 if (id < 0 || id >= server.dbnum)
2370 return REDIS_ERR;
3305306f 2371 c->db = &server.db[id];
ed9b544e 2372 return REDIS_OK;
2373}
2374
40d224a9 2375static void *dupClientReplyValue(void *o) {
2376 incrRefCount((robj*)o);
12d090d2 2377 return o;
40d224a9 2378}
2379
ed9b544e 2380static redisClient *createClient(int fd) {
2381 redisClient *c = zmalloc(sizeof(*c));
2382
2383 anetNonBlock(NULL,fd);
2384 anetTcpNoDelay(NULL,fd);
2385 if (!c) return NULL;
2386 selectDb(c,0);
2387 c->fd = fd;
2388 c->querybuf = sdsempty();
2389 c->argc = 0;
93ea3759 2390 c->argv = NULL;
ed9b544e 2391 c->bulklen = -1;
e8a74421 2392 c->multibulk = 0;
2393 c->mbargc = 0;
2394 c->mbargv = NULL;
ed9b544e 2395 c->sentlen = 0;
2396 c->flags = 0;
2397 c->lastinteraction = time(NULL);
abcb223e 2398 c->authenticated = 0;
40d224a9 2399 c->replstate = REDIS_REPL_NONE;
6b47e12e 2400 c->reply = listCreate();
ed9b544e 2401 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2402 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2403 c->blockingkeys = NULL;
2404 c->blockingkeysnum = 0;
2405 c->io_keys = listCreate();
2406 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2407 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2408 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2409 freeClient(c);
2410 return NULL;
2411 }
6b47e12e 2412 listAddNodeTail(server.clients,c);
6e469882 2413 initClientMultiState(c);
ed9b544e 2414 return c;
2415}
2416
2417static void addReply(redisClient *c, robj *obj) {
2418 if (listLength(c->reply) == 0 &&
6208b3a7 2419 (c->replstate == REDIS_REPL_NONE ||
2420 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2421 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2422 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2423
2424 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2425 obj = dupStringObject(obj);
2426 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2427 }
9d65a1bb 2428 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2429}
2430
2431static void addReplySds(redisClient *c, sds s) {
2432 robj *o = createObject(REDIS_STRING,s);
2433 addReply(c,o);
2434 decrRefCount(o);
2435}
2436
e2665397 2437static void addReplyDouble(redisClient *c, double d) {
2438 char buf[128];
2439
2440 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2441 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2442 (unsigned long) strlen(buf),buf));
e2665397 2443}
2444
f44dd428 2445static void addReplyLong(redisClient *c, long l) {
2446 char buf[128];
2447 size_t len;
2448
2449 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2450 addReplySds(c,sdsnewlen(buf,len));
2451}
2452
942a3961 2453static void addReplyBulkLen(redisClient *c, robj *obj) {
2454 size_t len;
2455
2456 if (obj->encoding == REDIS_ENCODING_RAW) {
2457 len = sdslen(obj->ptr);
2458 } else {
2459 long n = (long)obj->ptr;
2460
e054afda 2461 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2462 len = 1;
2463 if (n < 0) {
2464 len++;
2465 n = -n;
2466 }
2467 while((n = n/10) != 0) {
2468 len++;
2469 }
2470 }
83c6a618 2471 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2472}
2473
ed9b544e 2474static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2475 int cport, cfd;
2476 char cip[128];
285add55 2477 redisClient *c;
ed9b544e 2478 REDIS_NOTUSED(el);
2479 REDIS_NOTUSED(mask);
2480 REDIS_NOTUSED(privdata);
2481
2482 cfd = anetAccept(server.neterr, fd, cip, &cport);
2483 if (cfd == AE_ERR) {
f870935d 2484 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2485 return;
2486 }
f870935d 2487 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2488 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2489 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2490 close(cfd); /* May be already closed, just ingore errors */
2491 return;
2492 }
285add55 2493 /* If maxclient directive is set and this is one client more... close the
2494 * connection. Note that we create the client instead to check before
2495 * for this condition, since now the socket is already set in nonblocking
2496 * mode and we can send an error for free using the Kernel I/O */
2497 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2498 char *err = "-ERR max number of clients reached\r\n";
2499
2500 /* That's a best effort error message, don't check write errors */
fee803ba 2501 if (write(c->fd,err,strlen(err)) == -1) {
2502 /* Nothing to do, Just to avoid the warning... */
2503 }
285add55 2504 freeClient(c);
2505 return;
2506 }
ed9b544e 2507 server.stat_numconnections++;
2508}
2509
2510/* ======================= Redis objects implementation ===================== */
2511
2512static robj *createObject(int type, void *ptr) {
2513 robj *o;
2514
a5819310 2515 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2516 if (listLength(server.objfreelist)) {
2517 listNode *head = listFirst(server.objfreelist);
2518 o = listNodeValue(head);
2519 listDelNode(server.objfreelist,head);
a5819310 2520 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2521 } else {
75680a3c 2522 if (server.vm_enabled) {
a5819310 2523 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2524 o = zmalloc(sizeof(*o));
2525 } else {
2526 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2527 }
ed9b544e 2528 }
ed9b544e 2529 o->type = type;
942a3961 2530 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2531 o->ptr = ptr;
2532 o->refcount = 1;
3a66edc7 2533 if (server.vm_enabled) {
1064ef87 2534 /* Note that this code may run in the context of an I/O thread
2535 * and accessing to server.unixtime in theory is an error
2536 * (no locks). But in practice this is safe, and even if we read
2537 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2538 o->vm.atime = server.unixtime;
2539 o->storage = REDIS_VM_MEMORY;
2540 }
ed9b544e 2541 return o;
2542}
2543
2544static robj *createStringObject(char *ptr, size_t len) {
2545 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2546}
2547
4ef8de8a 2548static robj *dupStringObject(robj *o) {
b9bc0eef 2549 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2550 return createStringObject(o->ptr,sdslen(o->ptr));
2551}
2552
ed9b544e 2553static robj *createListObject(void) {
2554 list *l = listCreate();
2555
ed9b544e 2556 listSetFreeMethod(l,decrRefCount);
2557 return createObject(REDIS_LIST,l);
2558}
2559
2560static robj *createSetObject(void) {
2561 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2562 return createObject(REDIS_SET,d);
2563}
2564
5234952b 2565static robj *createHashObject(void) {
2566 /* All the Hashes start as zipmaps. Will be automatically converted
2567 * into hash tables if there are enough elements or big elements
2568 * inside. */
2569 unsigned char *zm = zipmapNew();
2570 robj *o = createObject(REDIS_HASH,zm);
2571 o->encoding = REDIS_ENCODING_ZIPMAP;
2572 return o;
2573}
2574
1812e024 2575static robj *createZsetObject(void) {
6b47e12e 2576 zset *zs = zmalloc(sizeof(*zs));
2577
2578 zs->dict = dictCreate(&zsetDictType,NULL);
2579 zs->zsl = zslCreate();
2580 return createObject(REDIS_ZSET,zs);
1812e024 2581}
2582
ed9b544e 2583static void freeStringObject(robj *o) {
942a3961 2584 if (o->encoding == REDIS_ENCODING_RAW) {
2585 sdsfree(o->ptr);
2586 }
ed9b544e 2587}
2588
2589static void freeListObject(robj *o) {
2590 listRelease((list*) o->ptr);
2591}
2592
2593static void freeSetObject(robj *o) {
2594 dictRelease((dict*) o->ptr);
2595}
2596
fd8ccf44 2597static void freeZsetObject(robj *o) {
2598 zset *zs = o->ptr;
2599
2600 dictRelease(zs->dict);
2601 zslFree(zs->zsl);
2602 zfree(zs);
2603}
2604
ed9b544e 2605static void freeHashObject(robj *o) {
2606 dictRelease((dict*) o->ptr);
2607}
2608
2609static void incrRefCount(robj *o) {
f2b8ab34 2610 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2611 o->refcount++;
2612}
2613
2614static void decrRefCount(void *obj) {
2615 robj *o = obj;
94754ccc 2616
970e10bb 2617 /* Object is a key of a swapped out value, or in the process of being
2618 * loaded. */
996cb5f7 2619 if (server.vm_enabled &&
2620 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2621 {
2622 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2623 redisAssert(o->refcount == 1);
2624 }
2625 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2626 redisAssert(o->type == REDIS_STRING);
a35ddf12 2627 freeStringObject(o);
2628 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2629 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2630 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2631 !listAddNodeHead(server.objfreelist,o))
2632 zfree(o);
a5819310 2633 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2634 server.vm_stats_swapped_objects--;
a35ddf12 2635 return;
2636 }
996cb5f7 2637 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2638 if (--(o->refcount) == 0) {
996cb5f7 2639 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2640 vmCancelThreadedIOJob(obj);
ed9b544e 2641 switch(o->type) {
2642 case REDIS_STRING: freeStringObject(o); break;
2643 case REDIS_LIST: freeListObject(o); break;
2644 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2645 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2646 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2647 default: redisAssert(0 != 0); break;
ed9b544e 2648 }
a5819310 2649 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2650 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2651 !listAddNodeHead(server.objfreelist,o))
2652 zfree(o);
a5819310 2653 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2654 }
2655}
2656
942a3961 2657static robj *lookupKey(redisDb *db, robj *key) {
2658 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2659 if (de) {
55cf8433 2660 robj *key = dictGetEntryKey(de);
2661 robj *val = dictGetEntryVal(de);
3a66edc7 2662
55cf8433 2663 if (server.vm_enabled) {
996cb5f7 2664 if (key->storage == REDIS_VM_MEMORY ||
2665 key->storage == REDIS_VM_SWAPPING)
2666 {
2667 /* If we were swapping the object out, stop it, this key
2668 * was requested. */
2669 if (key->storage == REDIS_VM_SWAPPING)
2670 vmCancelThreadedIOJob(key);
55cf8433 2671 /* Update the access time of the key for the aging algorithm. */
2672 key->vm.atime = server.unixtime;
2673 } else {
d5d55fc3 2674 int notify = (key->storage == REDIS_VM_LOADING);
2675
55cf8433 2676 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2677 redisAssert(val == NULL);
55cf8433 2678 val = vmLoadObject(key);
2679 dictGetEntryVal(de) = val;
d5d55fc3 2680
2681 /* Clients blocked by the VM subsystem may be waiting for
2682 * this key... */
2683 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2684 }
2685 }
2686 return val;
3a66edc7 2687 } else {
2688 return NULL;
2689 }
942a3961 2690}
2691
2692static robj *lookupKeyRead(redisDb *db, robj *key) {
2693 expireIfNeeded(db,key);
2694 return lookupKey(db,key);
2695}
2696
2697static robj *lookupKeyWrite(redisDb *db, robj *key) {
2698 deleteIfVolatile(db,key);
2699 return lookupKey(db,key);
2700}
2701
2702static int deleteKey(redisDb *db, robj *key) {
2703 int retval;
2704
2705 /* We need to protect key from destruction: after the first dictDelete()
2706 * it may happen that 'key' is no longer valid if we don't increment
2707 * it's count. This may happen when we get the object reference directly
2708 * from the hash table with dictRandomKey() or dict iterators */
2709 incrRefCount(key);
2710 if (dictSize(db->expires)) dictDelete(db->expires,key);
2711 retval = dictDelete(db->dict,key);
2712 decrRefCount(key);
2713
2714 return retval == DICT_OK;
2715}
2716
10c43610 2717/* Try to share an object against the shared objects pool */
2718static robj *tryObjectSharing(robj *o) {
2719 struct dictEntry *de;
2720 unsigned long c;
2721
3305306f 2722 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2723
dfc5e96c 2724 redisAssert(o->type == REDIS_STRING);
10c43610 2725 de = dictFind(server.sharingpool,o);
2726 if (de) {
2727 robj *shared = dictGetEntryKey(de);
2728
2729 c = ((unsigned long) dictGetEntryVal(de))+1;
2730 dictGetEntryVal(de) = (void*) c;
2731 incrRefCount(shared);
2732 decrRefCount(o);
2733 return shared;
2734 } else {
2735 /* Here we are using a stream algorihtm: Every time an object is
2736 * shared we increment its count, everytime there is a miss we
2737 * recrement the counter of a random object. If this object reaches
2738 * zero we remove the object and put the current object instead. */
3305306f 2739 if (dictSize(server.sharingpool) >=
10c43610 2740 server.sharingpoolsize) {
2741 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2742 redisAssert(de != NULL);
10c43610 2743 c = ((unsigned long) dictGetEntryVal(de))-1;
2744 dictGetEntryVal(de) = (void*) c;
2745 if (c == 0) {
2746 dictDelete(server.sharingpool,de->key);
2747 }
2748 } else {
2749 c = 0; /* If the pool is empty we want to add this object */
2750 }
2751 if (c == 0) {
2752 int retval;
2753
2754 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2755 redisAssert(retval == DICT_OK);
10c43610 2756 incrRefCount(o);
2757 }
2758 return o;
2759 }
2760}
2761
724a51b1 2762/* Check if the nul-terminated string 's' can be represented by a long
2763 * (that is, is a number that fits into long without any other space or
2764 * character before or after the digits).
2765 *
2766 * If so, the function returns REDIS_OK and *longval is set to the value
2767 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2768static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2769 char buf[32], *endptr;
2770 long value;
2771 int slen;
2772
2773 value = strtol(s, &endptr, 10);
2774 if (endptr[0] != '\0') return REDIS_ERR;
2775 slen = snprintf(buf,32,"%ld",value);
2776
2777 /* If the number converted back into a string is not identical
2778 * then it's not possible to encode the string as integer */
f69f2cba 2779 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2780 if (longval) *longval = value;
2781 return REDIS_OK;
2782}
2783
942a3961 2784/* Try to encode a string object in order to save space */
2785static int tryObjectEncoding(robj *o) {
2786 long value;
942a3961 2787 sds s = o->ptr;
3305306f 2788
942a3961 2789 if (o->encoding != REDIS_ENCODING_RAW)
2790 return REDIS_ERR; /* Already encoded */
3305306f 2791
942a3961 2792 /* It's not save to encode shared objects: shared objects can be shared
2793 * everywhere in the "object space" of Redis. Encoded objects can only
2794 * appear as "values" (and not, for instance, as keys) */
2795 if (o->refcount > 1) return REDIS_ERR;
3305306f 2796
942a3961 2797 /* Currently we try to encode only strings */
dfc5e96c 2798 redisAssert(o->type == REDIS_STRING);
94754ccc 2799
724a51b1 2800 /* Check if we can represent this string as a long integer */
2801 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2802
2803 /* Ok, this object can be encoded */
2804 o->encoding = REDIS_ENCODING_INT;
2805 sdsfree(o->ptr);
2806 o->ptr = (void*) value;
2807 return REDIS_OK;
2808}
2809
9d65a1bb 2810/* Get a decoded version of an encoded object (returned as a new object).
2811 * If the object is already raw-encoded just increment the ref count. */
2812static robj *getDecodedObject(robj *o) {
942a3961 2813 robj *dec;
2814
9d65a1bb 2815 if (o->encoding == REDIS_ENCODING_RAW) {
2816 incrRefCount(o);
2817 return o;
2818 }
942a3961 2819 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2820 char buf[32];
2821
2822 snprintf(buf,32,"%ld",(long)o->ptr);
2823 dec = createStringObject(buf,strlen(buf));
2824 return dec;
2825 } else {
dfc5e96c 2826 redisAssert(1 != 1);
942a3961 2827 }
3305306f 2828}
2829
d7f43c08 2830/* Compare two string objects via strcmp() or alike.
2831 * Note that the objects may be integer-encoded. In such a case we
2832 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2833 * and compare the strings, it's much faster than calling getDecodedObject().
2834 *
2835 * Important note: if objects are not integer encoded, but binary-safe strings,
2836 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2837 * binary safe. */
724a51b1 2838static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2839 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2840 char bufa[128], bufb[128], *astr, *bstr;
2841 int bothsds = 1;
724a51b1 2842
e197b441 2843 if (a == b) return 0;
d7f43c08 2844 if (a->encoding != REDIS_ENCODING_RAW) {
2845 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2846 astr = bufa;
2847 bothsds = 0;
724a51b1 2848 } else {
d7f43c08 2849 astr = a->ptr;
724a51b1 2850 }
d7f43c08 2851 if (b->encoding != REDIS_ENCODING_RAW) {
2852 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2853 bstr = bufb;
2854 bothsds = 0;
2855 } else {
2856 bstr = b->ptr;
2857 }
2858 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2859}
2860
0ea663ea 2861static size_t stringObjectLen(robj *o) {
dfc5e96c 2862 redisAssert(o->type == REDIS_STRING);
0ea663ea 2863 if (o->encoding == REDIS_ENCODING_RAW) {
2864 return sdslen(o->ptr);
2865 } else {
2866 char buf[32];
2867
2868 return snprintf(buf,32,"%ld",(long)o->ptr);
2869 }
2870}
2871
06233c45 2872/*============================ RDB saving/loading =========================== */
ed9b544e 2873
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2878
/* Write 't' to 'fp' as a 4 byte signed integer, in the byte order of the
 * host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
2884
e3566d4b 2885/* check rdbLoadLen() comments for more info */
f78fd11b 2886static int rdbSaveLen(FILE *fp, uint32_t len) {
2887 unsigned char buf[2];
2888
2889 if (len < (1<<6)) {
2890 /* Save a 6 bit len */
10c43610 2891 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2892 if (fwrite(buf,1,1,fp) == 0) return -1;
2893 } else if (len < (1<<14)) {
2894 /* Save a 14 bit len */
10c43610 2895 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2896 buf[1] = len&0xFF;
17be1a4a 2897 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2898 } else {
2899 /* Save a 32 bit len */
10c43610 2900 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2901 if (fwrite(buf,1,1,fp) == 0) return -1;
2902 len = htonl(len);
2903 if (fwrite(&len,4,1,fp) == 0) return -1;
2904 }
2905 return 0;
2906}
2907
e3566d4b 2908/* String objects in the form "2391" "-100" without any space and with a
2909 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2910 * encoded as integers to save space */
56906eef 2911static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
e3566d4b 2912 long long value;
2913 char *endptr, buf[32];
2914
2915 /* Check if it's possible to encode this value as a number */
2916 value = strtoll(s, &endptr, 10);
2917 if (endptr[0] != '\0') return 0;
2918 snprintf(buf,32,"%lld",value);
2919
2920 /* If the number converted back into a string is not identical
2921 * then it's not possible to encode the string as integer */
2922 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2923
2924 /* Finally check if it fits in our ranges */
2925 if (value >= -(1<<7) && value <= (1<<7)-1) {
2926 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2927 enc[1] = value&0xFF;
2928 return 2;
2929 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2930 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2931 enc[1] = value&0xFF;
2932 enc[2] = (value>>8)&0xFF;
2933 return 3;
2934 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2935 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2936 enc[1] = value&0xFF;
2937 enc[2] = (value>>8)&0xFF;
2938 enc[3] = (value>>16)&0xFF;
2939 enc[4] = (value>>24)&0xFF;
2940 return 5;
2941 } else {
2942 return 0;
2943 }
2944}
2945
774e3047 2946static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2947 unsigned int comprlen, outlen;
2948 unsigned char byte;
2949 void *out;
2950
2951 /* We require at least four bytes compression for this to be worth it */
2952 outlen = sdslen(obj->ptr)-4;
2953 if (outlen <= 0) return 0;
3a2694c4 2954 if ((out = zmalloc(outlen+1)) == NULL) return 0;
774e3047 2955 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2956 if (comprlen == 0) {
88e85998 2957 zfree(out);
774e3047 2958 return 0;
2959 }
2960 /* Data compressed! Let's save it on disk */
2961 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2962 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2963 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2964 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2965 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 2966 zfree(out);
774e3047 2967 return comprlen;
2968
2969writeerr:
88e85998 2970 zfree(out);
774e3047 2971 return -1;
2972}
2973
e3566d4b 2974/* Save a string objet as [len][data] on disk. If the object is a string
2975 * representation of an integer value we try to safe it in a special form */
942a3961 2976static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2977 size_t len;
e3566d4b 2978 int enclen;
10c43610 2979
942a3961 2980 len = sdslen(obj->ptr);
2981
774e3047 2982 /* Try integer encoding */
e3566d4b 2983 if (len <= 11) {
2984 unsigned char buf[5];
2985 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2986 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2987 return 0;
2988 }
2989 }
774e3047 2990
2991 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 2992 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 2993 if (server.rdbcompression && len > 20) {
774e3047 2994 int retval;
2995
2996 retval = rdbSaveLzfStringObject(fp,obj);
2997 if (retval == -1) return -1;
2998 if (retval > 0) return 0;
2999 /* retval == 0 means data can't be compressed, save the old way */
3000 }
3001
3002 /* Store verbatim */
10c43610 3003 if (rdbSaveLen(fp,len) == -1) return -1;
3004 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
3005 return 0;
3006}
3007
942a3961 3008/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3009static int rdbSaveStringObject(FILE *fp, robj *obj) {
3010 int retval;
942a3961 3011
f2d9f50f 3012 /* Avoid incr/decr ref count business when possible.
3013 * This plays well with copy-on-write given that we are probably
3014 * in a child process (BGSAVE). Also this makes sure key objects
3015 * of swapped objects are not incRefCount-ed (an assert does not allow
3016 * this in order to avoid bugs) */
3017 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3018 obj = getDecodedObject(obj);
3019 retval = rdbSaveStringObjectRaw(fp,obj);
3020 decrRefCount(obj);
3021 } else {
996cb5f7 3022 retval = rdbSaveStringObjectRaw(fp,obj);
3023 }
9d65a1bb 3024 return retval;
942a3961 3025}
3026
/* Save a double value. Doubles are saved as a "%.17g" string prefixed by
 * an unsigned 8 bit integer holding the length of the string. Three values
 * of the prefix are reserved and are not followed by any string:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    size_t total;

    if (isnan(val)) {
        buf[0] = 253;
        total = 1;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
        total = 1;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        total = (size_t)buf[0]+1;
    }
    return (fwrite(buf,total,1,fp) == 0) ? -1 : 0;
}
3053
06233c45 3054/* Save a Redis object. */
3055static int rdbSaveObject(FILE *fp, robj *o) {
3056 if (o->type == REDIS_STRING) {
3057 /* Save a string value */
3058 if (rdbSaveStringObject(fp,o) == -1) return -1;
3059 } else if (o->type == REDIS_LIST) {
3060 /* Save a list value */
3061 list *list = o->ptr;
c7df85a4 3062 listIter li;
06233c45 3063 listNode *ln;
3064
06233c45 3065 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3066 listRewind(list,&li);
3067 while((ln = listNext(&li))) {
06233c45 3068 robj *eleobj = listNodeValue(ln);
3069
3070 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3071 }
3072 } else if (o->type == REDIS_SET) {
3073 /* Save a set value */
3074 dict *set = o->ptr;
3075 dictIterator *di = dictGetIterator(set);
3076 dictEntry *de;
3077
3078 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3079 while((de = dictNext(di)) != NULL) {
3080 robj *eleobj = dictGetEntryKey(de);
3081
3082 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3083 }
3084 dictReleaseIterator(di);
3085 } else if (o->type == REDIS_ZSET) {
3086 /* Save a set value */
3087 zset *zs = o->ptr;
3088 dictIterator *di = dictGetIterator(zs->dict);
3089 dictEntry *de;
3090
3091 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3092 while((de = dictNext(di)) != NULL) {
3093 robj *eleobj = dictGetEntryKey(de);
3094 double *score = dictGetEntryVal(de);
3095
3096 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3097 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3098 }
3099 dictReleaseIterator(di);
3100 } else {
3101 redisAssert(0 != 0);
3102 }
3103 return 0;
3104}
3105
3106/* Return the length the object will have on disk if saved with
3107 * the rdbSaveObject() function. Currently we use a trick to get
3108 * this length with very little changes to the code. In the future
3109 * we could switch to a faster solution. */
b9bc0eef 3110static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3111 if (fp == NULL) fp = server.devnull;
06233c45 3112 rewind(fp);
3113 assert(rdbSaveObject(fp,o) != 1);
3114 return ftello(fp);
3115}
3116
06224fec 3117/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3118static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3119 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3120
3121 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3122}
3123
ed9b544e 3124/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3125static int rdbSave(char *filename) {
ed9b544e 3126 dictIterator *di = NULL;
3127 dictEntry *de;
ed9b544e 3128 FILE *fp;
3129 char tmpfile[256];
3130 int j;
bb32ede5 3131 time_t now = time(NULL);
ed9b544e 3132
2316bb3b 3133 /* Wait for I/O therads to terminate, just in case this is a
3134 * foreground-saving, to avoid seeking the swap file descriptor at the
3135 * same time. */
3136 if (server.vm_enabled)
3137 waitEmptyIOJobsQueue();
3138
a3b21203 3139 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3140 fp = fopen(tmpfile,"w");
3141 if (!fp) {
3142 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3143 return REDIS_ERR;
3144 }
f78fd11b 3145 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3146 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3147 redisDb *db = server.db+j;
3148 dict *d = db->dict;
3305306f 3149 if (dictSize(d) == 0) continue;
ed9b544e 3150 di = dictGetIterator(d);
3151 if (!di) {
3152 fclose(fp);
3153 return REDIS_ERR;
3154 }
3155
3156 /* Write the SELECT DB opcode */
f78fd11b 3157 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3158 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3159
3160 /* Iterate this DB writing every entry */
3161 while((de = dictNext(di)) != NULL) {
3162 robj *key = dictGetEntryKey(de);
3163 robj *o = dictGetEntryVal(de);
bb32ede5 3164 time_t expiretime = getExpire(db,key);
3165
3166 /* Save the expire time */
3167 if (expiretime != -1) {
3168 /* If this key is already expired skip it */
3169 if (expiretime < now) continue;
3170 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3171 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3172 }
7e69548d 3173 /* Save the key and associated value. This requires special
3174 * handling if the value is swapped out. */
996cb5f7 3175 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3176 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3177 /* Save type, key, value */
3178 if (rdbSaveType(fp,o->type) == -1) goto werr;
3179 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3180 if (rdbSaveObject(fp,o) == -1) goto werr;
3181 } else {
996cb5f7 3182 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3183 robj *po;
7e69548d 3184 /* Get a preview of the object in memory */
3185 po = vmPreviewObject(key);
7e69548d 3186 /* Save type, key, value */
3187 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3188 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3189 if (rdbSaveObject(fp,po) == -1) goto werr;
3190 /* Remove the loaded object from memory */
3191 decrRefCount(po);
7e69548d 3192 }
ed9b544e 3193 }
3194 dictReleaseIterator(di);
3195 }
3196 /* EOF opcode */
f78fd11b 3197 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3198
3199 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3200 fflush(fp);
3201 fsync(fileno(fp));
3202 fclose(fp);
3203
3204 /* Use RENAME to make sure the DB file is changed atomically only
3205 * if the generate DB file is ok. */
3206 if (rename(tmpfile,filename) == -1) {
325d1eb4 3207 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3208 unlink(tmpfile);
3209 return REDIS_ERR;
3210 }
3211 redisLog(REDIS_NOTICE,"DB saved on disk");
3212 server.dirty = 0;
3213 server.lastsave = time(NULL);
3214 return REDIS_OK;
3215
3216werr:
3217 fclose(fp);
3218 unlink(tmpfile);
3219 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3220 if (di) dictReleaseIterator(di);
3221 return REDIS_ERR;
3222}
3223
f78fd11b 3224static int rdbSaveBackground(char *filename) {
ed9b544e 3225 pid_t childpid;
3226
9d65a1bb 3227 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3228 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3229 if ((childpid = fork()) == 0) {
3230 /* Child */
054e426d 3231 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3232 close(server.fd);
f78fd11b 3233 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3234 _exit(0);
ed9b544e 3235 } else {
478c2c6f 3236 _exit(1);
ed9b544e 3237 }
3238 } else {
3239 /* Parent */
5a7c647e 3240 if (childpid == -1) {
3241 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3242 strerror(errno));
3243 return REDIS_ERR;
3244 }
ed9b544e 3245 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3246 server.bgsavechildpid = childpid;
ed9b544e 3247 return REDIS_OK;
3248 }
3249 return REDIS_OK; /* unreached */
3250}
3251
/* Remove the temporary dump file named after the process id 'childpid'
 * (the same "temp-<pid>.rdb" name built by rdbSave()). */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3258
/* Read one byte from 'fp' and return it as a value in the 0-255 range, or
 * -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3264
/* Read a 4 byte signed integer (host byte order) from 'fp' and return it
 * as a time_t, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) == 0) return -1;
    return (time_t) stamp;
}
3270
e3566d4b 3271/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3272 * of this file for a description of how this are stored on disk.
3273 *
3274 * isencoded is set to 1 if the readed length is not actually a length but
3275 * an "encoding type", check the above comments for more info */
c78a8ccc 3276static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3277 unsigned char buf[2];
3278 uint32_t len;
c78a8ccc 3279 int type;
f78fd11b 3280
e3566d4b 3281 if (isencoded) *isencoded = 0;
c78a8ccc 3282 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3283 type = (buf[0]&0xC0)>>6;
3284 if (type == REDIS_RDB_6BITLEN) {
3285 /* Read a 6 bit len */
3286 return buf[0]&0x3F;
3287 } else if (type == REDIS_RDB_ENCVAL) {
3288 /* Read a 6 bit len encoding type */
3289 if (isencoded) *isencoded = 1;
3290 return buf[0]&0x3F;
3291 } else if (type == REDIS_RDB_14BITLEN) {
3292 /* Read a 14 bit len */
3293 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3294 return ((buf[0]&0x3F)<<8)|buf[1];
3295 } else {
3296 /* Read a 32 bit len */
f78fd11b 3297 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3298 return ntohl(len);
f78fd11b 3299 }
f78fd11b 3300}
3301
e3566d4b 3302static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3303 unsigned char enc[4];
3304 long long val;
3305
3306 if (enctype == REDIS_RDB_ENC_INT8) {
3307 if (fread(enc,1,1,fp) == 0) return NULL;
3308 val = (signed char)enc[0];
3309 } else if (enctype == REDIS_RDB_ENC_INT16) {
3310 uint16_t v;
3311 if (fread(enc,2,1,fp) == 0) return NULL;
3312 v = enc[0]|(enc[1]<<8);
3313 val = (int16_t)v;
3314 } else if (enctype == REDIS_RDB_ENC_INT32) {
3315 uint32_t v;
3316 if (fread(enc,4,1,fp) == 0) return NULL;
3317 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3318 val = (int32_t)v;
3319 } else {
3320 val = 0; /* anti-warning */
dfc5e96c 3321 redisAssert(0!=0);
e3566d4b 3322 }
3323 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3324}
3325
c78a8ccc 3326static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3327 unsigned int len, clen;
3328 unsigned char *c = NULL;
3329 sds val = NULL;
3330
c78a8ccc 3331 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3332 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3333 if ((c = zmalloc(clen)) == NULL) goto err;
3334 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3335 if (fread(c,clen,1,fp) == 0) goto err;
3336 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3337 zfree(c);
88e85998 3338 return createObject(REDIS_STRING,val);
3339err:
3340 zfree(c);
3341 sdsfree(val);
3342 return NULL;
3343}
3344
c78a8ccc 3345static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3346 int isencoded;
3347 uint32_t len;
f78fd11b 3348 sds val;
3349
c78a8ccc 3350 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3351 if (isencoded) {
3352 switch(len) {
3353 case REDIS_RDB_ENC_INT8:
3354 case REDIS_RDB_ENC_INT16:
3355 case REDIS_RDB_ENC_INT32:
3305306f 3356 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3357 case REDIS_RDB_ENC_LZF:
c78a8ccc 3358 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3359 default:
dfc5e96c 3360 redisAssert(0!=0);
e3566d4b 3361 }
3362 }
3363
f78fd11b 3364 if (len == REDIS_RDB_LENERR) return NULL;
3365 val = sdsnewlen(NULL,len);
3366 if (len && fread(val,len,1,fp) == 0) {
3367 sdsfree(val);
3368 return NULL;
3369 }
10c43610 3370 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3371}
3372
a7866db6 3373/* For information about double serialization check rdbSaveDoubleValue() */
3374static int rdbLoadDoubleValue(FILE *fp, double *val) {
3375 char buf[128];
3376 unsigned char len;
3377
3378 if (fread(&len,1,1,fp) == 0) return -1;
3379 switch(len) {
3380 case 255: *val = R_NegInf; return 0;
3381 case 254: *val = R_PosInf; return 0;
3382 case 253: *val = R_Nan; return 0;
3383 default:
3384 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3385 buf[len] = '\0';
a7866db6 3386 sscanf(buf, "%lg", val);
3387 return 0;
3388 }
3389}
3390
c78a8ccc 3391/* Load a Redis object of the specified type from the specified file.
3392 * On success a newly allocated object is returned, otherwise NULL. */
3393static robj *rdbLoadObject(int type, FILE *fp) {
3394 robj *o;
3395
3396 if (type == REDIS_STRING) {
3397 /* Read string value */
3398 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3399 tryObjectEncoding(o);
3400 } else if (type == REDIS_LIST || type == REDIS_SET) {
3401 /* Read list/set value */
3402 uint32_t listlen;
3403
3404 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3405 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3406 /* It's faster to expand the dict to the right size asap in order
3407 * to avoid rehashing */
3408 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3409 dictExpand(o->ptr,listlen);
c78a8ccc 3410 /* Load every single element of the list/set */
3411 while(listlen--) {
3412 robj *ele;
3413
3414 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3415 tryObjectEncoding(ele);
3416 if (type == REDIS_LIST) {
3417 listAddNodeTail((list*)o->ptr,ele);
3418 } else {
3419 dictAdd((dict*)o->ptr,ele,NULL);
3420 }
3421 }
3422 } else if (type == REDIS_ZSET) {
3423 /* Read list/set value */
3424 uint32_t zsetlen;
3425 zset *zs;
3426
3427 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3428 o = createZsetObject();
3429 zs = o->ptr;
3430 /* Load every single element of the list/set */
3431 while(zsetlen--) {
3432 robj *ele;
3433 double *score = zmalloc(sizeof(double));
3434
3435 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3436 tryObjectEncoding(ele);
3437 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3438 dictAdd(zs->dict,ele,score);
3439 zslInsert(zs->zsl,*score,ele);
3440 incrRefCount(ele); /* added to skiplist */
3441 }
3442 } else {
3443 redisAssert(0 != 0);
3444 }
3445 return o;
3446}
3447
f78fd11b 3448static int rdbLoad(char *filename) {
ed9b544e 3449 FILE *fp;
f78fd11b 3450 robj *keyobj = NULL;
3451 uint32_t dbid;
bb32ede5 3452 int type, retval, rdbver;
3305306f 3453 dict *d = server.db[0].dict;
bb32ede5 3454 redisDb *db = server.db+0;
f78fd11b 3455 char buf[1024];
bb32ede5 3456 time_t expiretime = -1, now = time(NULL);
b492cf00 3457 long long loadedkeys = 0;
bb32ede5 3458
ed9b544e 3459 fp = fopen(filename,"r");
3460 if (!fp) return REDIS_ERR;
3461 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3462 buf[9] = '\0';
3463 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3464 fclose(fp);
3465 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3466 return REDIS_ERR;
3467 }
f78fd11b 3468 rdbver = atoi(buf+5);
c78a8ccc 3469 if (rdbver != 1) {
f78fd11b 3470 fclose(fp);
3471 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3472 return REDIS_ERR;
3473 }
ed9b544e 3474 while(1) {
3475 robj *o;
3476
3477 /* Read type. */
f78fd11b 3478 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3479 if (type == REDIS_EXPIRETIME) {
3480 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3481 /* We read the time so we need to read the object type again */
3482 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3483 }
ed9b544e 3484 if (type == REDIS_EOF) break;
3485 /* Handle SELECT DB opcode as a special case */
3486 if (type == REDIS_SELECTDB) {
c78a8ccc 3487 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3488 goto eoferr;
ed9b544e 3489 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3490 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3491 exit(1);
3492 }
bb32ede5 3493 db = server.db+dbid;
3494 d = db->dict;
ed9b544e 3495 continue;
3496 }
3497 /* Read key */
c78a8ccc 3498 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3499 /* Read value */
3500 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3501 /* Add the new object in the hash table */
f78fd11b 3502 retval = dictAdd(d,keyobj,o);
ed9b544e 3503 if (retval == DICT_ERR) {
f78fd11b 3504 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3505 exit(1);
3506 }
bb32ede5 3507 /* Set the expire time if needed */
3508 if (expiretime != -1) {
3509 setExpire(db,keyobj,expiretime);
3510 /* Delete this key if already expired */
3511 if (expiretime < now) deleteKey(db,keyobj);
3512 expiretime = -1;
3513 }
f78fd11b 3514 keyobj = o = NULL;
b492cf00 3515 /* Handle swapping while loading big datasets when VM is on */
3516 loadedkeys++;
3517 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3518 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3519 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3520 }
3521 }
ed9b544e 3522 }
3523 fclose(fp);
3524 return REDIS_OK;
3525
3526eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3527 if (keyobj) decrRefCount(keyobj);
f80dff62 3528 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3529 exit(1);
3530 return REDIS_ERR; /* Just to avoid warning */
3531}
3532
3533/*================================== Commands =============================== */
3534
abcb223e 3535static void authCommand(redisClient *c) {
2e77c2ee 3536 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3537 c->authenticated = 1;
3538 addReply(c,shared.ok);
3539 } else {
3540 c->authenticated = 0;
fa4c0aba 3541 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3542 }
3543}
3544
ed9b544e 3545static void pingCommand(redisClient *c) {
3546 addReply(c,shared.pong);
3547}
3548
3549static void echoCommand(redisClient *c) {
942a3961 3550 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3551 addReply(c,c->argv[1]);
3552 addReply(c,shared.crlf);
3553}
3554
3555/*=================================== Strings =============================== */
3556
3557static void setGenericCommand(redisClient *c, int nx) {
3558 int retval;
3559
333fd216 3560 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3561 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3562 if (retval == DICT_ERR) {
3563 if (!nx) {
1b03836c 3564 /* If the key is about a swapped value, we want a new key object
3565 * to overwrite the old. So we delete the old key in the database.
3566 * This will also make sure that swap pages about the old object
3567 * will be marked as free. */
ddfaca9d 3568 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3569 incrRefCount(c->argv[1]);
3305306f 3570 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3571 incrRefCount(c->argv[2]);
3572 } else {
c937aa89 3573 addReply(c,shared.czero);
ed9b544e 3574 return;
3575 }
3576 } else {
3577 incrRefCount(c->argv[1]);
3578 incrRefCount(c->argv[2]);
3579 }
3580 server.dirty++;
3305306f 3581 removeExpire(c->db,c->argv[1]);
c937aa89 3582 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3583}
3584
3585static void setCommand(redisClient *c) {
a4d1ba9a 3586 setGenericCommand(c,0);
ed9b544e 3587}
3588
3589static void setnxCommand(redisClient *c) {
a4d1ba9a 3590 setGenericCommand(c,1);
ed9b544e 3591}
3592
322fc7d8 3593static int getGenericCommand(redisClient *c) {
3305306f 3594 robj *o = lookupKeyRead(c->db,c->argv[1]);
3595
3596 if (o == NULL) {
c937aa89 3597 addReply(c,shared.nullbulk);
322fc7d8 3598 return REDIS_OK;
ed9b544e 3599 } else {
ed9b544e 3600 if (o->type != REDIS_STRING) {
c937aa89 3601 addReply(c,shared.wrongtypeerr);
322fc7d8 3602 return REDIS_ERR;
ed9b544e 3603 } else {
942a3961 3604 addReplyBulkLen(c,o);
ed9b544e 3605 addReply(c,o);
3606 addReply(c,shared.crlf);
322fc7d8 3607 return REDIS_OK;
ed9b544e 3608 }
3609 }
3610}
3611
322fc7d8 3612static void getCommand(redisClient *c) {
3613 getGenericCommand(c);
3614}
3615
f6b141c5 3616static void getsetCommand(redisClient *c) {
322fc7d8 3617 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3618 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3619 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3620 } else {
3621 incrRefCount(c->argv[1]);
3622 }
3623 incrRefCount(c->argv[2]);
3624 server.dirty++;
3625 removeExpire(c->db,c->argv[1]);
3626}
3627
70003d28 3628static void mgetCommand(redisClient *c) {
70003d28 3629 int j;
3630
c937aa89 3631 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3632 for (j = 1; j < c->argc; j++) {
3305306f 3633 robj *o = lookupKeyRead(c->db,c->argv[j]);
3634 if (o == NULL) {
c937aa89 3635 addReply(c,shared.nullbulk);
70003d28 3636 } else {
70003d28 3637 if (o->type != REDIS_STRING) {
c937aa89 3638 addReply(c,shared.nullbulk);
70003d28 3639 } else {
942a3961 3640 addReplyBulkLen(c,o);
70003d28 3641 addReply(c,o);
3642 addReply(c,shared.crlf);
3643 }
3644 }
3645 }
3646}
3647
6c446631 3648static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3649 int j, busykeys = 0;
6c446631 3650
3651 if ((c->argc % 2) == 0) {
454d4e43 3652 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3653 return;
3654 }
3655 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3656 * set nothing at all if at least one already key exists. */
3657 if (nx) {
3658 for (j = 1; j < c->argc; j += 2) {
906573e7 3659 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3660 busykeys++;
6c446631 3661 }
3662 }
3663 }
906573e7 3664 if (busykeys) {
3665 addReply(c, shared.czero);
3666 return;
3667 }
6c446631 3668
3669 for (j = 1; j < c->argc; j += 2) {
3670 int retval;
3671
17511391 3672 tryObjectEncoding(c->argv[j+1]);
6c446631 3673 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3674 if (retval == DICT_ERR) {
3675 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3676 incrRefCount(c->argv[j+1]);
3677 } else {
3678 incrRefCount(c->argv[j]);
3679 incrRefCount(c->argv[j+1]);
3680 }
3681 removeExpire(c->db,c->argv[j]);
3682 }
3683 server.dirty += (c->argc-1)/2;
3684 addReply(c, nx ? shared.cone : shared.ok);
3685}
3686
3687static void msetCommand(redisClient *c) {
3688 msetGenericCommand(c,0);
3689}
3690
3691static void msetnxCommand(redisClient *c) {
3692 msetGenericCommand(c,1);
3693}
3694
d68ed120 3695static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3696 long long value;
3697 int retval;
3698 robj *o;
3699
3305306f 3700 o = lookupKeyWrite(c->db,c->argv[1]);
3701 if (o == NULL) {
ed9b544e 3702 value = 0;
3703 } else {
ed9b544e 3704 if (o->type != REDIS_STRING) {
3705 value = 0;
3706 } else {
3707 char *eptr;
3708
942a3961 3709 if (o->encoding == REDIS_ENCODING_RAW)
3710 value = strtoll(o->ptr, &eptr, 10);
3711 else if (o->encoding == REDIS_ENCODING_INT)
3712 value = (long)o->ptr;
3713 else
dfc5e96c 3714 redisAssert(1 != 1);
ed9b544e 3715 }
3716 }
3717
3718 value += incr;
3719 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3720 tryObjectEncoding(o);
3305306f 3721 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3722 if (retval == DICT_ERR) {
3305306f 3723 dictReplace(c->db->dict,c->argv[1],o);
3724 removeExpire(c->db,c->argv[1]);
ed9b544e 3725 } else {
3726 incrRefCount(c->argv[1]);
3727 }
3728 server.dirty++;
c937aa89 3729 addReply(c,shared.colon);
ed9b544e 3730 addReply(c,o);
3731 addReply(c,shared.crlf);
3732}
3733
3734static void incrCommand(redisClient *c) {
a4d1ba9a 3735 incrDecrCommand(c,1);
ed9b544e 3736}
3737
3738static void decrCommand(redisClient *c) {
a4d1ba9a 3739 incrDecrCommand(c,-1);
ed9b544e 3740}
3741
3742static void incrbyCommand(redisClient *c) {
d68ed120 3743 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3744 incrDecrCommand(c,incr);
ed9b544e 3745}
3746
3747static void decrbyCommand(redisClient *c) {
d68ed120 3748 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3749 incrDecrCommand(c,-incr);
ed9b544e 3750}
3751
4b00bebd 3752static void appendCommand(redisClient *c) {
3753 int retval;
3754 size_t totlen;
3755 robj *o;
3756
3757 o = lookupKeyWrite(c->db,c->argv[1]);
3758 if (o == NULL) {
3759 /* Create the key */
3760 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3761 incrRefCount(c->argv[1]);
3762 incrRefCount(c->argv[2]);
3763 totlen = stringObjectLen(c->argv[2]);
3764 } else {
3765 dictEntry *de;
3766
3767 de = dictFind(c->db->dict,c->argv[1]);
3768 assert(de != NULL);
3769
3770 o = dictGetEntryVal(de);
3771 if (o->type != REDIS_STRING) {
3772 addReply(c,shared.wrongtypeerr);
3773 return;
3774 }
3775 /* If the object is specially encoded or shared we have to make
3776 * a copy */
3777 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3778 robj *decoded = getDecodedObject(o);
3779
3780 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3781 decrRefCount(decoded);
3782 dictReplace(c->db->dict,c->argv[1],o);
3783 }
3784 /* APPEND! */
3785 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3786 o->ptr = sdscatlen(o->ptr,
3787 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3788 } else {
3789 o->ptr = sdscatprintf(o->ptr, "%ld",
3790 (unsigned long) c->argv[2]->ptr);
3791 }
3792 totlen = sdslen(o->ptr);
3793 }
3794 server.dirty++;
3795 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3796}
3797
39191553 3798static void substrCommand(redisClient *c) {
3799 robj *o;
3800 long start = atoi(c->argv[2]->ptr);
3801 long end = atoi(c->argv[3]->ptr);
3802
3803 o = lookupKeyRead(c->db,c->argv[1]);
3804 if (o == NULL) {
3805 addReply(c,shared.nullbulk);
3806 } else {
3807 if (o->type != REDIS_STRING) {
3808 addReply(c,shared.wrongtypeerr);
3809 } else {
8fe7fad7 3810 size_t rangelen, strlen;
39191553 3811 sds range;
3812
8fe7fad7 3813 o = getDecodedObject(o);
3814 strlen = sdslen(o->ptr);
3815
39191553 3816 /* convert negative indexes */
3817 if (start < 0) start = strlen+start;
3818 if (end < 0) end = strlen+end;
3819 if (start < 0) start = 0;
3820 if (end < 0) end = 0;
3821
3822 /* indexes sanity checks */
3823 if (start > end || (size_t)start >= strlen) {
3824 /* Out of range start or start > end result in null reply */
3825 addReply(c,shared.nullbulk);
8fe7fad7 3826 decrRefCount(o);
39191553 3827 return;
3828 }
3829 if ((size_t)end >= strlen) end = strlen-1;
3830 rangelen = (end-start)+1;
3831
3832 /* Return the result */
3833 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen));
3834 range = sdsnewlen((char*)o->ptr+start,rangelen);
3835 addReplySds(c,range);
3836 addReply(c,shared.crlf);
8fe7fad7 3837 decrRefCount(o);
39191553 3838 }
3839 }
3840}
3841
ed9b544e 3842/* ========================= Type agnostic commands ========================= */
3843
3844static void delCommand(redisClient *c) {
5109cdff 3845 int deleted = 0, j;
3846
3847 for (j = 1; j < c->argc; j++) {
3848 if (deleteKey(c->db,c->argv[j])) {
3849 server.dirty++;
3850 deleted++;
3851 }
3852 }
3853 switch(deleted) {
3854 case 0:
c937aa89 3855 addReply(c,shared.czero);
5109cdff 3856 break;
3857 case 1:
3858 addReply(c,shared.cone);
3859 break;
3860 default:
3861 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3862 break;
ed9b544e 3863 }
3864}
3865
3866static void existsCommand(redisClient *c) {
3305306f 3867 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 3868}
3869
3870static void selectCommand(redisClient *c) {
3871 int id = atoi(c->argv[1]->ptr);
3872
3873 if (selectDb(c,id) == REDIS_ERR) {
774e3047 3874 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 3875 } else {
3876 addReply(c,shared.ok);
3877 }
3878}
3879
3880static void randomkeyCommand(redisClient *c) {
3881 dictEntry *de;
3305306f 3882
3883 while(1) {
3884 de = dictGetRandomKey(c->db->dict);
ce7bef07 3885 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 3886 }
ed9b544e 3887 if (de == NULL) {
ce7bef07 3888 addReply(c,shared.plus);
ed9b544e 3889 addReply(c,shared.crlf);
3890 } else {
c937aa89 3891 addReply(c,shared.plus);
ed9b544e 3892 addReply(c,dictGetEntryKey(de));
3893 addReply(c,shared.crlf);
3894 }
3895}
3896
3897static void keysCommand(redisClient *c) {
3898 dictIterator *di;
3899 dictEntry *de;
3900 sds pattern = c->argv[1]->ptr;
3901 int plen = sdslen(pattern);
a3f9eec2 3902 unsigned long numkeys = 0;
ed9b544e 3903 robj *lenobj = createObject(REDIS_STRING,NULL);
3904
3305306f 3905 di = dictGetIterator(c->db->dict);
ed9b544e 3906 addReply(c,lenobj);
3907 decrRefCount(lenobj);
3908 while((de = dictNext(di)) != NULL) {
3909 robj *keyobj = dictGetEntryKey(de);
3305306f 3910
ed9b544e 3911 sds key = keyobj->ptr;
3912 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3913 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 3914 if (expireIfNeeded(c->db,keyobj) == 0) {
a3f9eec2 3915 addReplyBulkLen(c,keyobj);
3305306f 3916 addReply(c,keyobj);
a3f9eec2 3917 addReply(c,shared.crlf);
3305306f 3918 numkeys++;
3305306f 3919 }
ed9b544e 3920 }
3921 }
3922 dictReleaseIterator(di);
a3f9eec2 3923 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 3924}
3925
3926static void dbsizeCommand(redisClient *c) {
3927 addReplySds(c,
3305306f 3928 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 3929}
3930
3931static void lastsaveCommand(redisClient *c) {
3932 addReplySds(c,
c937aa89 3933 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 3934}
3935
3936static void typeCommand(redisClient *c) {
3305306f 3937 robj *o;
ed9b544e 3938 char *type;
3305306f 3939
3940 o = lookupKeyRead(c->db,c->argv[1]);
3941 if (o == NULL) {
c937aa89 3942 type = "+none";
ed9b544e 3943 } else {
ed9b544e 3944 switch(o->type) {
c937aa89 3945 case REDIS_STRING: type = "+string"; break;
3946 case REDIS_LIST: type = "+list"; break;
3947 case REDIS_SET: type = "+set"; break;
412a8bce 3948 case REDIS_ZSET: type = "+zset"; break;
ed9b544e 3949 default: type = "unknown"; break;
3950 }
3951 }
3952 addReplySds(c,sdsnew(type));
3953 addReply(c,shared.crlf);
3954}
3955
3956static void saveCommand(redisClient *c) {
9d65a1bb 3957 if (server.bgsavechildpid != -1) {
05557f6d 3958 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3959 return;
3960 }
f78fd11b 3961 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 3962 addReply(c,shared.ok);
3963 } else {
3964 addReply(c,shared.err);
3965 }
3966}
3967
3968static void bgsaveCommand(redisClient *c) {
9d65a1bb 3969 if (server.bgsavechildpid != -1) {
ed9b544e 3970 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3971 return;
3972 }
f78fd11b 3973 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 3974 char *status = "+Background saving started\r\n";
3975 addReplySds(c,sdsnew(status));
ed9b544e 3976 } else {
3977 addReply(c,shared.err);
3978 }
3979}
3980
3981static void shutdownCommand(redisClient *c) {
3982 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 3983 /* Kill the saving child if there is a background saving in progress.
3984 We want to avoid race conditions, for instance our saving child may
3985 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 3986 if (server.bgsavechildpid != -1) {
9f3c422c 3987 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3988 kill(server.bgsavechildpid,SIGKILL);
a3b21203 3989 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 3990 }
ac945e2d 3991 if (server.appendonly) {
3992 /* Append only file: fsync() the AOF and exit */
3993 fsync(server.appendfd);
054e426d 3994 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 3995 exit(0);
ed9b544e 3996 } else {
ac945e2d 3997 /* Snapshotting. Perform a SYNC SAVE and exit */
3998 if (rdbSave(server.dbfilename) == REDIS_OK) {
3999 if (server.daemonize)
4000 unlink(server.pidfile);
4001 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4002 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4003 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4004 exit(0);
4005 } else {
4006 /* Ooops.. error saving! The best we can do is to continue operating.
4007 * Note that if there was a background saving process, in the next
4008 * cron() Redis will be notified that the background saving aborted,
4009 * handling special stuff like slaves pending for synchronization... */
4010 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4011 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4012 }
ed9b544e 4013 }
4014}
4015
4016static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4017 robj *o;
4018
4019 /* To use the same key as src and dst is probably an error */
4020 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4021 addReply(c,shared.sameobjecterr);
ed9b544e 4022 return;
4023 }
4024
3305306f 4025 o = lookupKeyWrite(c->db,c->argv[1]);
4026 if (o == NULL) {
c937aa89 4027 addReply(c,shared.nokeyerr);
ed9b544e 4028 return;
4029 }
ed9b544e 4030 incrRefCount(o);
3305306f 4031 deleteIfVolatile(c->db,c->argv[2]);
4032 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4033 if (nx) {
4034 decrRefCount(o);
c937aa89 4035 addReply(c,shared.czero);
ed9b544e 4036 return;
4037 }
3305306f 4038 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4039 } else {
4040 incrRefCount(c->argv[2]);
4041 }
3305306f 4042 deleteKey(c->db,c->argv[1]);
ed9b544e 4043 server.dirty++;
c937aa89 4044 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4045}
4046
4047static void renameCommand(redisClient *c) {
4048 renameGenericCommand(c,0);
4049}
4050
4051static void renamenxCommand(redisClient *c) {
4052 renameGenericCommand(c,1);
4053}
4054
4055static void moveCommand(redisClient *c) {
3305306f 4056 robj *o;
4057 redisDb *src, *dst;
ed9b544e 4058 int srcid;
4059
4060 /* Obtain source and target DB pointers */
3305306f 4061 src = c->db;
4062 srcid = c->db->id;
ed9b544e 4063 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4064 addReply(c,shared.outofrangeerr);
ed9b544e 4065 return;
4066 }
3305306f 4067 dst = c->db;
4068 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4069
4070 /* If the user is moving using as target the same
4071 * DB as the source DB it is probably an error. */
4072 if (src == dst) {
c937aa89 4073 addReply(c,shared.sameobjecterr);
ed9b544e 4074 return;
4075 }
4076
4077 /* Check if the element exists and get a reference */
3305306f 4078 o = lookupKeyWrite(c->db,c->argv[1]);
4079 if (!o) {
c937aa89 4080 addReply(c,shared.czero);
ed9b544e 4081 return;
4082 }
4083
4084 /* Try to add the element to the target DB */
3305306f 4085 deleteIfVolatile(dst,c->argv[1]);
4086 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4087 addReply(c,shared.czero);
ed9b544e 4088 return;
4089 }
3305306f 4090 incrRefCount(c->argv[1]);
ed9b544e 4091 incrRefCount(o);
4092
4093 /* OK! key moved, free the entry in the source DB */
3305306f 4094 deleteKey(src,c->argv[1]);
ed9b544e 4095 server.dirty++;
c937aa89 4096 addReply(c,shared.cone);
ed9b544e 4097}
4098
4099/* =================================== Lists ================================ */
4100static void pushGenericCommand(redisClient *c, int where) {
4101 robj *lobj;
ed9b544e 4102 list *list;
3305306f 4103
4104 lobj = lookupKeyWrite(c->db,c->argv[1]);
4105 if (lobj == NULL) {
95242ab5 4106 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4107 addReply(c,shared.cone);
95242ab5 4108 return;
4109 }
ed9b544e 4110 lobj = createListObject();
4111 list = lobj->ptr;
4112 if (where == REDIS_HEAD) {
6b47e12e 4113 listAddNodeHead(list,c->argv[2]);
ed9b544e 4114 } else {
6b47e12e 4115 listAddNodeTail(list,c->argv[2]);
ed9b544e 4116 }
3305306f 4117 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4118 incrRefCount(c->argv[1]);
4119 incrRefCount(c->argv[2]);
4120 } else {
ed9b544e 4121 if (lobj->type != REDIS_LIST) {
4122 addReply(c,shared.wrongtypeerr);
4123 return;
4124 }
95242ab5 4125 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4126 addReply(c,shared.cone);
95242ab5 4127 return;
4128 }
ed9b544e 4129 list = lobj->ptr;
4130 if (where == REDIS_HEAD) {
6b47e12e 4131 listAddNodeHead(list,c->argv[2]);
ed9b544e 4132 } else {
6b47e12e 4133 listAddNodeTail(list,c->argv[2]);
ed9b544e 4134 }
4135 incrRefCount(c->argv[2]);
4136 }
4137 server.dirty++;
520b5a33 4138 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4139}
4140
4141static void lpushCommand(redisClient *c) {
4142 pushGenericCommand(c,REDIS_HEAD);
4143}
4144
4145static void rpushCommand(redisClient *c) {
4146 pushGenericCommand(c,REDIS_TAIL);
4147}
4148
4149static void llenCommand(redisClient *c) {
3305306f 4150 robj *o;
ed9b544e 4151 list *l;
4152
3305306f 4153 o = lookupKeyRead(c->db,c->argv[1]);
4154 if (o == NULL) {
c937aa89 4155 addReply(c,shared.czero);
ed9b544e 4156 return;
4157 } else {
ed9b544e 4158 if (o->type != REDIS_LIST) {
c937aa89 4159 addReply(c,shared.wrongtypeerr);
ed9b544e 4160 } else {
4161 l = o->ptr;
c937aa89 4162 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 4163 }
4164 }
4165}
4166
4167static void lindexCommand(redisClient *c) {
3305306f 4168 robj *o;
ed9b544e 4169 int index = atoi(c->argv[2]->ptr);
4170
3305306f 4171 o = lookupKeyRead(c->db,c->argv[1]);
4172 if (o == NULL) {
c937aa89 4173 addReply(c,shared.nullbulk);
ed9b544e 4174 } else {
ed9b544e 4175 if (o->type != REDIS_LIST) {
c937aa89 4176 addReply(c,shared.wrongtypeerr);
ed9b544e 4177 } else {
4178 list *list = o->ptr;
4179 listNode *ln;
4180
4181 ln = listIndex(list, index);
4182 if (ln == NULL) {
c937aa89 4183 addReply(c,shared.nullbulk);
ed9b544e 4184 } else {
4185 robj *ele = listNodeValue(ln);
942a3961 4186 addReplyBulkLen(c,ele);
ed9b544e 4187 addReply(c,ele);
4188 addReply(c,shared.crlf);
4189 }
4190 }
4191 }
4192}
4193
4194static void lsetCommand(redisClient *c) {
3305306f 4195 robj *o;
ed9b544e 4196 int index = atoi(c->argv[2]->ptr);
4197
3305306f 4198 o = lookupKeyWrite(c->db,c->argv[1]);
4199 if (o == NULL) {
ed9b544e 4200 addReply(c,shared.nokeyerr);
4201 } else {
ed9b544e 4202 if (o->type != REDIS_LIST) {
4203 addReply(c,shared.wrongtypeerr);
4204 } else {
4205 list *list = o->ptr;
4206 listNode *ln;
4207
4208 ln = listIndex(list, index);
4209 if (ln == NULL) {
c937aa89 4210 addReply(c,shared.outofrangeerr);
ed9b544e 4211 } else {
4212 robj *ele = listNodeValue(ln);
4213
4214 decrRefCount(ele);
4215 listNodeValue(ln) = c->argv[3];
4216 incrRefCount(c->argv[3]);
4217 addReply(c,shared.ok);
4218 server.dirty++;
4219 }
4220 }
4221 }
4222}
4223
4224static void popGenericCommand(redisClient *c, int where) {
3305306f 4225 robj *o;
4226
4227 o = lookupKeyWrite(c->db,c->argv[1]);
4228 if (o == NULL) {
c937aa89 4229 addReply(c,shared.nullbulk);
ed9b544e 4230 } else {
ed9b544e 4231 if (o->type != REDIS_LIST) {
c937aa89 4232 addReply(c,shared.wrongtypeerr);
ed9b544e 4233 } else {
4234 list *list = o->ptr;
4235 listNode *ln;
4236
4237 if (where == REDIS_HEAD)
4238 ln = listFirst(list);
4239 else
4240 ln = listLast(list);
4241
4242 if (ln == NULL) {
c937aa89 4243 addReply(c,shared.nullbulk);
ed9b544e 4244 } else {
4245 robj *ele = listNodeValue(ln);
942a3961 4246 addReplyBulkLen(c,ele);
ed9b544e 4247 addReply(c,ele);
4248 addReply(c,shared.crlf);
4249 listDelNode(list,ln);
4250 server.dirty++;
4251 }
4252 }
4253 }
4254}
4255
4256static void lpopCommand(redisClient *c) {
4257 popGenericCommand(c,REDIS_HEAD);
4258}
4259
4260static void rpopCommand(redisClient *c) {
4261 popGenericCommand(c,REDIS_TAIL);
4262}
4263
4264static void lrangeCommand(redisClient *c) {
3305306f 4265 robj *o;
ed9b544e 4266 int start = atoi(c->argv[2]->ptr);
4267 int end = atoi(c->argv[3]->ptr);
3305306f 4268
4269 o = lookupKeyRead(c->db,c->argv[1]);
4270 if (o == NULL) {
c937aa89 4271 addReply(c,shared.nullmultibulk);
ed9b544e 4272 } else {
ed9b544e 4273 if (o->type != REDIS_LIST) {
c937aa89 4274 addReply(c,shared.wrongtypeerr);
ed9b544e 4275 } else {
4276 list *list = o->ptr;
4277 listNode *ln;
4278 int llen = listLength(list);
4279 int rangelen, j;
4280 robj *ele;
4281
4282 /* convert negative indexes */
4283 if (start < 0) start = llen+start;
4284 if (end < 0) end = llen+end;
4285 if (start < 0) start = 0;
4286 if (end < 0) end = 0;
4287
4288 /* indexes sanity checks */
4289 if (start > end || start >= llen) {
4290 /* Out of range start or start > end result in empty list */
c937aa89 4291 addReply(c,shared.emptymultibulk);
ed9b544e 4292 return;
4293 }
4294 if (end >= llen) end = llen-1;
4295 rangelen = (end-start)+1;
4296
4297 /* Return the result in form of a multi-bulk reply */
4298 ln = listIndex(list, start);
c937aa89 4299 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 4300 for (j = 0; j < rangelen; j++) {
4301 ele = listNodeValue(ln);
942a3961 4302 addReplyBulkLen(c,ele);
ed9b544e 4303 addReply(c,ele);
4304 addReply(c,shared.crlf);
4305 ln = ln->next;
4306 }
4307 }
4308 }
4309}
4310
4311static void ltrimCommand(redisClient *c) {
3305306f 4312 robj *o;
ed9b544e 4313 int start = atoi(c->argv[2]->ptr);
4314 int end = atoi(c->argv[3]->ptr);
4315
3305306f 4316 o = lookupKeyWrite(c->db,c->argv[1]);
4317 if (o == NULL) {
ab9d4cb1 4318 addReply(c,shared.ok);
ed9b544e 4319 } else {
ed9b544e 4320 if (o->type != REDIS_LIST) {
4321 addReply(c,shared.wrongtypeerr);
4322 } else {
4323 list *list = o->ptr;
4324 listNode *ln;
4325 int llen = listLength(list);
4326 int j, ltrim, rtrim;
4327
4328 /* convert negative indexes */
4329 if (start < 0) start = llen+start;
4330 if (end < 0) end = llen+end;
4331 if (start < 0) start = 0;
4332 if (end < 0) end = 0;
4333
4334 /* indexes sanity checks */
4335 if (start > end || start >= llen) {
4336 /* Out of range start or start > end result in empty list */
4337 ltrim = llen;
4338 rtrim = 0;
4339 } else {
4340 if (end >= llen) end = llen-1;
4341 ltrim = start;
4342 rtrim = llen-end-1;
4343 }
4344
4345 /* Remove list elements to perform the trim */
4346 for (j = 0; j < ltrim; j++) {
4347 ln = listFirst(list);
4348 listDelNode(list,ln);
4349 }
4350 for (j = 0; j < rtrim; j++) {
4351 ln = listLast(list);
4352 listDelNode(list,ln);
4353 }
ed9b544e 4354 server.dirty++;
e59229a2 4355 addReply(c,shared.ok);
ed9b544e 4356 }
4357 }
4358}
4359
4360static void lremCommand(redisClient *c) {
3305306f 4361 robj *o;
ed9b544e 4362
3305306f 4363 o = lookupKeyWrite(c->db,c->argv[1]);
4364 if (o == NULL) {
33c08b39 4365 addReply(c,shared.czero);
ed9b544e 4366 } else {
ed9b544e 4367 if (o->type != REDIS_LIST) {
c937aa89 4368 addReply(c,shared.wrongtypeerr);
ed9b544e 4369 } else {
4370 list *list = o->ptr;
4371 listNode *ln, *next;
4372 int toremove = atoi(c->argv[2]->ptr);
4373 int removed = 0;
4374 int fromtail = 0;
4375
4376 if (toremove < 0) {
4377 toremove = -toremove;
4378 fromtail = 1;
4379 }
4380 ln = fromtail ? list->tail : list->head;
4381 while (ln) {
ed9b544e 4382 robj *ele = listNodeValue(ln);
a4d1ba9a 4383
4384 next = fromtail ? ln->prev : ln->next;
724a51b1 4385 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4386 listDelNode(list,ln);
4387 server.dirty++;
4388 removed++;
4389 if (toremove && removed == toremove) break;
4390 }
4391 ln = next;
4392 }
c937aa89 4393 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4394 }
4395 }
4396}
4397
12f9d551 4398/* This is the semantic of this command:
0f5f7e9a 4399 * RPOPLPUSH srclist dstlist:
12f9d551 4400 * IF LLEN(srclist) > 0
4401 * element = RPOP srclist
4402 * LPUSH dstlist element
4403 * RETURN element
4404 * ELSE
4405 * RETURN nil
4406 * END
4407 * END
4408 *
4409 * The idea is to be able to get an element from a list in a reliable way
4410 * since the element is not just returned but pushed against another list
4411 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4412 */
0f5f7e9a 4413static void rpoplpushcommand(redisClient *c) {
12f9d551 4414 robj *sobj;
4415
4416 sobj = lookupKeyWrite(c->db,c->argv[1]);
4417 if (sobj == NULL) {
4418 addReply(c,shared.nullbulk);
4419 } else {
4420 if (sobj->type != REDIS_LIST) {
4421 addReply(c,shared.wrongtypeerr);
4422 } else {
4423 list *srclist = sobj->ptr;
4424 listNode *ln = listLast(srclist);
4425
4426 if (ln == NULL) {
4427 addReply(c,shared.nullbulk);
4428 } else {
4429 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4430 robj *ele = listNodeValue(ln);
4431 list *dstlist;
4432
e20fb74f 4433 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4434 addReply(c,shared.wrongtypeerr);
4435 return;
4436 }
e20fb74f 4437
4438 /* Add the element to the target list (unless it's directly
4439 * passed to some BLPOP-ing client */
4440 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4441 if (dobj == NULL) {
4442 /* Create the list if the key does not exist */
4443 dobj = createListObject();
4444 dictAdd(c->db->dict,c->argv[2],dobj);
4445 incrRefCount(c->argv[2]);
4446 }
4447 dstlist = dobj->ptr;
4448 listAddNodeHead(dstlist,ele);
4449 incrRefCount(ele);
4450 }
12f9d551 4451
4452 /* Send the element to the client as reply as well */
4453 addReplyBulkLen(c,ele);
4454 addReply(c,ele);
4455 addReply(c,shared.crlf);
4456
4457 /* Finally remove the element from the source list */
4458 listDelNode(srclist,ln);
4459 server.dirty++;
4460 }
4461 }
4462 }
4463}
4464
4465
ed9b544e 4466/* ==================================== Sets ================================ */
4467
4468static void saddCommand(redisClient *c) {
ed9b544e 4469 robj *set;
4470
3305306f 4471 set = lookupKeyWrite(c->db,c->argv[1]);
4472 if (set == NULL) {
ed9b544e 4473 set = createSetObject();
3305306f 4474 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4475 incrRefCount(c->argv[1]);
4476 } else {
ed9b544e 4477 if (set->type != REDIS_SET) {
c937aa89 4478 addReply(c,shared.wrongtypeerr);
ed9b544e 4479 return;
4480 }
4481 }
4482 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4483 incrRefCount(c->argv[2]);
4484 server.dirty++;
c937aa89 4485 addReply(c,shared.cone);
ed9b544e 4486 } else {
c937aa89 4487 addReply(c,shared.czero);
ed9b544e 4488 }
4489}
4490
4491static void sremCommand(redisClient *c) {
3305306f 4492 robj *set;
ed9b544e 4493
3305306f 4494 set = lookupKeyWrite(c->db,c->argv[1]);
4495 if (set == NULL) {
c937aa89 4496 addReply(c,shared.czero);
ed9b544e 4497 } else {
ed9b544e 4498 if (set->type != REDIS_SET) {
c937aa89 4499 addReply(c,shared.wrongtypeerr);
ed9b544e 4500 return;
4501 }
4502 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4503 server.dirty++;
12fea928 4504 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4505 addReply(c,shared.cone);
ed9b544e 4506 } else {
c937aa89 4507 addReply(c,shared.czero);
ed9b544e 4508 }
4509 }
4510}
4511
a4460ef4 4512static void smoveCommand(redisClient *c) {
4513 robj *srcset, *dstset;
4514
4515 srcset = lookupKeyWrite(c->db,c->argv[1]);
4516 dstset = lookupKeyWrite(c->db,c->argv[2]);
4517
4518 /* If the source key does not exist return 0, if it's of the wrong type
4519 * raise an error */
4520 if (srcset == NULL || srcset->type != REDIS_SET) {
4521 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4522 return;
4523 }
4524 /* Error if the destination key is not a set as well */
4525 if (dstset && dstset->type != REDIS_SET) {
4526 addReply(c,shared.wrongtypeerr);
4527 return;
4528 }
4529 /* Remove the element from the source set */
4530 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4531 /* Key not found in the src set! return zero */
4532 addReply(c,shared.czero);
4533 return;
4534 }
4535 server.dirty++;
4536 /* Add the element to the destination set */
4537 if (!dstset) {
4538 dstset = createSetObject();
4539 dictAdd(c->db->dict,c->argv[2],dstset);
4540 incrRefCount(c->argv[2]);
4541 }
4542 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4543 incrRefCount(c->argv[3]);
4544 addReply(c,shared.cone);
4545}
4546
ed9b544e 4547static void sismemberCommand(redisClient *c) {
3305306f 4548 robj *set;
ed9b544e 4549
3305306f 4550 set = lookupKeyRead(c->db,c->argv[1]);
4551 if (set == NULL) {
c937aa89 4552 addReply(c,shared.czero);
ed9b544e 4553 } else {
ed9b544e 4554 if (set->type != REDIS_SET) {
c937aa89 4555 addReply(c,shared.wrongtypeerr);
ed9b544e 4556 return;
4557 }
4558 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4559 addReply(c,shared.cone);
ed9b544e 4560 else
c937aa89 4561 addReply(c,shared.czero);
ed9b544e 4562 }
4563}
4564
4565static void scardCommand(redisClient *c) {
3305306f 4566 robj *o;
ed9b544e 4567 dict *s;
4568
3305306f 4569 o = lookupKeyRead(c->db,c->argv[1]);
4570 if (o == NULL) {
c937aa89 4571 addReply(c,shared.czero);
ed9b544e 4572 return;
4573 } else {
ed9b544e 4574 if (o->type != REDIS_SET) {
c937aa89 4575 addReply(c,shared.wrongtypeerr);
ed9b544e 4576 } else {
4577 s = o->ptr;
682ac724 4578 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4579 dictSize(s)));
ed9b544e 4580 }
4581 }
4582}
4583
12fea928 4584static void spopCommand(redisClient *c) {
4585 robj *set;
4586 dictEntry *de;
4587
4588 set = lookupKeyWrite(c->db,c->argv[1]);
4589 if (set == NULL) {
4590 addReply(c,shared.nullbulk);
4591 } else {
4592 if (set->type != REDIS_SET) {
4593 addReply(c,shared.wrongtypeerr);
4594 return;
4595 }
4596 de = dictGetRandomKey(set->ptr);
4597 if (de == NULL) {
4598 addReply(c,shared.nullbulk);
4599 } else {
4600 robj *ele = dictGetEntryKey(de);
4601
942a3961 4602 addReplyBulkLen(c,ele);
12fea928 4603 addReply(c,ele);
4604 addReply(c,shared.crlf);
4605 dictDelete(set->ptr,ele);
4606 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4607 server.dirty++;
4608 }
4609 }
4610}
4611
2abb95a9 4612static void srandmemberCommand(redisClient *c) {
4613 robj *set;
4614 dictEntry *de;
4615
4616 set = lookupKeyRead(c->db,c->argv[1]);
4617 if (set == NULL) {
4618 addReply(c,shared.nullbulk);
4619 } else {
4620 if (set->type != REDIS_SET) {
4621 addReply(c,shared.wrongtypeerr);
4622 return;
4623 }
4624 de = dictGetRandomKey(set->ptr);
4625 if (de == NULL) {
4626 addReply(c,shared.nullbulk);
4627 } else {
4628 robj *ele = dictGetEntryKey(de);
4629
4630 addReplyBulkLen(c,ele);
4631 addReply(c,ele);
4632 addReply(c,shared.crlf);
4633 }
4634 }
4635}
4636
ed9b544e 4637static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4638 dict **d1 = (void*) s1, **d2 = (void*) s2;
4639
3305306f 4640 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4641}
4642
682ac724 4643static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4644 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4645 dictIterator *di;
4646 dictEntry *de;
4647 robj *lenobj = NULL, *dstset = NULL;
682ac724 4648 unsigned long j, cardinality = 0;
ed9b544e 4649
ed9b544e 4650 for (j = 0; j < setsnum; j++) {
4651 robj *setobj;
3305306f 4652
4653 setobj = dstkey ?
4654 lookupKeyWrite(c->db,setskeys[j]) :
4655 lookupKeyRead(c->db,setskeys[j]);
4656 if (!setobj) {
ed9b544e 4657 zfree(dv);
5faa6025 4658 if (dstkey) {
fdcaae84 4659 if (deleteKey(c->db,dstkey))
4660 server.dirty++;
0d36ded0 4661 addReply(c,shared.czero);
5faa6025 4662 } else {
4663 addReply(c,shared.nullmultibulk);
4664 }
ed9b544e 4665 return;
4666 }
ed9b544e 4667 if (setobj->type != REDIS_SET) {
4668 zfree(dv);
c937aa89 4669 addReply(c,shared.wrongtypeerr);
ed9b544e 4670 return;
4671 }
4672 dv[j] = setobj->ptr;
4673 }
4674 /* Sort sets from the smallest to largest, this will improve our
4675 * algorithm's performace */
4676 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4677
4678 /* The first thing we should output is the total number of elements...
4679 * since this is a multi-bulk write, but at this stage we don't know
4680 * the intersection set size, so we use a trick, append an empty object
4681 * to the output list and save the pointer to later modify it with the
4682 * right length */
4683 if (!dstkey) {
4684 lenobj = createObject(REDIS_STRING,NULL);
4685 addReply(c,lenobj);
4686 decrRefCount(lenobj);
4687 } else {
4688 /* If we have a target key where to store the resulting set
4689 * create this key with an empty set inside */
4690 dstset = createSetObject();
ed9b544e 4691 }
4692
4693 /* Iterate all the elements of the first (smallest) set, and test
4694 * the element against all the other sets, if at least one set does
4695 * not include the element it is discarded */
4696 di = dictGetIterator(dv[0]);
ed9b544e 4697
4698 while((de = dictNext(di)) != NULL) {
4699 robj *ele;
4700
4701 for (j = 1; j < setsnum; j++)
4702 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4703 if (j != setsnum)
4704 continue; /* at least one set does not contain the member */
4705 ele = dictGetEntryKey(de);
4706 if (!dstkey) {
942a3961 4707 addReplyBulkLen(c,ele);
ed9b544e 4708 addReply(c,ele);
4709 addReply(c,shared.crlf);
4710 cardinality++;
4711 } else {
4712 dictAdd(dstset->ptr,ele,NULL);
4713 incrRefCount(ele);
4714 }
4715 }
4716 dictReleaseIterator(di);
4717
83cdfe18
AG
4718 if (dstkey) {
4719 /* Store the resulting set into the target */
4720 deleteKey(c->db,dstkey);
4721 dictAdd(c->db->dict,dstkey,dstset);
4722 incrRefCount(dstkey);
4723 }
4724
40d224a9 4725 if (!dstkey) {
682ac724 4726 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4727 } else {
682ac724 4728 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4729 dictSize((dict*)dstset->ptr)));
40d224a9 4730 server.dirty++;
4731 }
ed9b544e 4732 zfree(dv);
4733}
4734
4735static void sinterCommand(redisClient *c) {
4736 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4737}
4738
4739static void sinterstoreCommand(redisClient *c) {
4740 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4741}
4742
f4f56e1d 4743#define REDIS_OP_UNION 0
4744#define REDIS_OP_DIFF 1
4745
4746static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4747 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4748 dictIterator *di;
4749 dictEntry *de;
f4f56e1d 4750 robj *dstset = NULL;
40d224a9 4751 int j, cardinality = 0;
4752
40d224a9 4753 for (j = 0; j < setsnum; j++) {
4754 robj *setobj;
4755
4756 setobj = dstkey ?
4757 lookupKeyWrite(c->db,setskeys[j]) :
4758 lookupKeyRead(c->db,setskeys[j]);
4759 if (!setobj) {
4760 dv[j] = NULL;
4761 continue;
4762 }
4763 if (setobj->type != REDIS_SET) {
4764 zfree(dv);
4765 addReply(c,shared.wrongtypeerr);
4766 return;
4767 }
4768 dv[j] = setobj->ptr;
4769 }
4770
4771 /* We need a temp set object to store our union. If the dstkey
4772 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4773 * this set object will be the resulting object to set into the target key*/
4774 dstset = createSetObject();
4775
40d224a9 4776 /* Iterate all the elements of all the sets, add every element a single
4777 * time to the result set */
4778 for (j = 0; j < setsnum; j++) {
51829ed3 4779 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4780 if (!dv[j]) continue; /* non existing keys are like empty sets */
4781
4782 di = dictGetIterator(dv[j]);
40d224a9 4783
4784 while((de = dictNext(di)) != NULL) {
4785 robj *ele;
4786
4787 /* dictAdd will not add the same element multiple times */
4788 ele = dictGetEntryKey(de);
f4f56e1d 4789 if (op == REDIS_OP_UNION || j == 0) {
4790 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4791 incrRefCount(ele);
40d224a9 4792 cardinality++;
4793 }
f4f56e1d 4794 } else if (op == REDIS_OP_DIFF) {
4795 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4796 cardinality--;
4797 }
40d224a9 4798 }
4799 }
4800 dictReleaseIterator(di);
51829ed3
AG
4801
4802 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4803 }
4804
f4f56e1d 4805 /* Output the content of the resulting set, if not in STORE mode */
4806 if (!dstkey) {
4807 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4808 di = dictGetIterator(dstset->ptr);
f4f56e1d 4809 while((de = dictNext(di)) != NULL) {
4810 robj *ele;
4811
4812 ele = dictGetEntryKey(de);
942a3961 4813 addReplyBulkLen(c,ele);
f4f56e1d 4814 addReply(c,ele);
4815 addReply(c,shared.crlf);
4816 }
4817 dictReleaseIterator(di);
83cdfe18
AG
4818 } else {
4819 /* If we have a target key where to store the resulting set
4820 * create this key with the result set inside */
4821 deleteKey(c->db,dstkey);
4822 dictAdd(c->db->dict,dstkey,dstset);
4823 incrRefCount(dstkey);
f4f56e1d 4824 }
4825
4826 /* Cleanup */
40d224a9 4827 if (!dstkey) {
40d224a9 4828 decrRefCount(dstset);
4829 } else {
682ac724 4830 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4831 dictSize((dict*)dstset->ptr)));
40d224a9 4832 server.dirty++;
4833 }
4834 zfree(dv);
4835}
4836
4837static void sunionCommand(redisClient *c) {
f4f56e1d 4838 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4839}
4840
4841static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4842 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4843}
4844
4845static void sdiffCommand(redisClient *c) {
4846 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4847}
4848
4849static void sdiffstoreCommand(redisClient *c) {
4850 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4851}
4852
6b47e12e 4853/* ==================================== ZSets =============================== */
4854
4855/* ZSETs are ordered sets using two data structures to hold the same elements
4856 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4857 * data structure.
4858 *
4859 * The elements are added to a hash table mapping Redis objects to scores.
4860 * At the same time the elements are added to a skip list mapping scores
4861 * to Redis objects (so objects are sorted by scores in this "view"). */
4862
4863/* This skiplist implementation is almost a C translation of the original
4864 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4865 * Alternative to Balanced Trees", modified in three ways:
4866 * a) this implementation allows for repeated values.
4867 * b) the comparison is not just by key (our 'score') but by satellite data.
4868 * c) there is a back pointer, so it's a doubly linked list with the back
4869 * pointers being only at "level 1". This allows to traverse the list
4870 * from tail to head, useful for ZREVRANGE. */
4871
4872static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4873 zskiplistNode *zn = zmalloc(sizeof(*zn));
4874
4875 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
4876 if (level > 0)
4877 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 4878 zn->score = score;
4879 zn->obj = obj;
4880 return zn;
4881}
4882
4883static zskiplist *zslCreate(void) {
4884 int j;
4885 zskiplist *zsl;
4886
4887 zsl = zmalloc(sizeof(*zsl));
4888 zsl->level = 1;
cc812361 4889 zsl->length = 0;
6b47e12e 4890 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 4891 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 4892 zsl->header->forward[j] = NULL;
94e543b5 4893
4894 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4895 if (j < ZSKIPLIST_MAXLEVEL-1)
4896 zsl->header->span[j] = 0;
69d95c3e 4897 }
e3870fab 4898 zsl->header->backward = NULL;
4899 zsl->tail = NULL;
6b47e12e 4900 return zsl;
4901}
4902
fd8ccf44 4903static void zslFreeNode(zskiplistNode *node) {
4904 decrRefCount(node->obj);
ad807e6f 4905 zfree(node->forward);
69d95c3e 4906 zfree(node->span);
fd8ccf44 4907 zfree(node);
4908}
4909
4910static void zslFree(zskiplist *zsl) {
ad807e6f 4911 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 4912
ad807e6f 4913 zfree(zsl->header->forward);
69d95c3e 4914 zfree(zsl->header->span);
ad807e6f 4915 zfree(zsl->header);
fd8ccf44 4916 while(node) {
599379dd 4917 next = node->forward[0];
fd8ccf44 4918 zslFreeNode(node);
4919 node = next;
4920 }
ad807e6f 4921 zfree(zsl);
fd8ccf44 4922}
4923
6b47e12e 4924static int zslRandomLevel(void) {
4925 int level = 1;
4926 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4927 level += 1;
4928 return level;
4929}
4930
4931static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4932 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 4933 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 4934 int i, level;
4935
4936 x = zsl->header;
4937 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
4938 /* store rank that is crossed to reach the insert position */
4939 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 4940
9d60e6e4 4941 while (x->forward[i] &&
4942 (x->forward[i]->score < score ||
4943 (x->forward[i]->score == score &&
69d95c3e 4944 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 4945 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 4946 x = x->forward[i];
69d95c3e 4947 }
6b47e12e 4948 update[i] = x;
4949 }
6b47e12e 4950 /* we assume the key is not already inside, since we allow duplicated
4951 * scores, and the re-insertion of score and redis object should never
4952 * happpen since the caller of zslInsert() should test in the hash table
4953 * if the element is already inside or not. */
4954 level = zslRandomLevel();
4955 if (level > zsl->level) {
69d95c3e 4956 for (i = zsl->level; i < level; i++) {
2b37892e 4957 rank[i] = 0;
6b47e12e 4958 update[i] = zsl->header;
2b37892e 4959 update[i]->span[i-1] = zsl->length;
69d95c3e 4960 }
6b47e12e 4961 zsl->level = level;
4962 }
4963 x = zslCreateNode(level,score,obj);
4964 for (i = 0; i < level; i++) {
4965 x->forward[i] = update[i]->forward[i];
4966 update[i]->forward[i] = x;
69d95c3e
PN
4967
4968 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
4969 if (i > 0) {
4970 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
4971 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
4972 }
6b47e12e 4973 }
69d95c3e
PN
4974
4975 /* increment span for untouched levels */
4976 for (i = level; i < zsl->level; i++) {
2b37892e 4977 update[i]->span[i-1]++;
69d95c3e
PN
4978 }
4979
bb975144 4980 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 4981 if (x->forward[0])
4982 x->forward[0]->backward = x;
4983 else
4984 zsl->tail = x;
cc812361 4985 zsl->length++;
6b47e12e 4986}
4987
50c55df5 4988/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 4989static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 4990 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4991 int i;
4992
4993 x = zsl->header;
4994 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4995 while (x->forward[i] &&
4996 (x->forward[i]->score < score ||
4997 (x->forward[i]->score == score &&
4998 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 4999 x = x->forward[i];
5000 update[i] = x;
5001 }
5002 /* We may have multiple elements with the same score, what we need
5003 * is to find the element with both the right score and object. */
5004 x = x->forward[0];
50c55df5 5005 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
9d60e6e4 5006 for (i = 0; i < zsl->level; i++) {
69d95c3e 5007 if (update[i]->forward[i] == x) {
2b37892e
PN
5008 if (i > 0) {
5009 update[i]->span[i-1] += x->span[i-1] - 1;
5010 }
69d95c3e
PN
5011 update[i]->forward[i] = x->forward[i];
5012 } else {
2b37892e
PN
5013 /* invariant: i > 0, because update[0]->forward[0]
5014 * is always equal to x */
5015 update[i]->span[i-1] -= 1;
69d95c3e 5016 }
9d60e6e4 5017 }
5018 if (x->forward[0]) {
709d0a1b 5019 x->forward[0]->backward = x->backward;
e197b441 5020 } else {
9d60e6e4 5021 zsl->tail = x->backward;
e197b441 5022 }
9d60e6e4 5023 zslFreeNode(x);
5024 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5025 zsl->level--;
5026 zsl->length--;
5027 return 1;
5028 } else {
5029 return 0; /* not found */
e197b441 5030 }
5031 return 0; /* not found */
fd8ccf44 5032}
5033
1807985b 5034/* Delete all the elements with score between min and max from the skiplist.
5035 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5036 * Note that this function takes the reference to the hash table view of the
5037 * sorted set, in order to remove the elements from the hash table too. */
5038static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
5039 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5040 unsigned long removed = 0;
5041 int i;
5042
5043 x = zsl->header;
5044 for (i = zsl->level-1; i >= 0; i--) {
5045 while (x->forward[i] && x->forward[i]->score < min)
5046 x = x->forward[i];
5047 update[i] = x;
5048 }
5049 /* We may have multiple elements with the same score, what we need
5050 * is to find the element with both the right score and object. */
5051 x = x->forward[0];
5052 while (x && x->score <= max) {
5053 zskiplistNode *next;
5054
5055 for (i = 0; i < zsl->level; i++) {
69d95c3e 5056 if (update[i]->forward[i] == x) {
2b37892e
PN
5057 if (i > 0) {
5058 update[i]->span[i-1] += x->span[i-1] - 1;
5059 }
69d95c3e
PN
5060 update[i]->forward[i] = x->forward[i];
5061 } else {
2b37892e
PN
5062 /* invariant: i > 0, because update[0]->forward[0]
5063 * is always equal to x */
5064 update[i]->span[i-1] -= 1;
69d95c3e 5065 }
1807985b 5066 }
5067 if (x->forward[0]) {
709d0a1b 5068 x->forward[0]->backward = x->backward;
1807985b 5069 } else {
5070 zsl->tail = x->backward;
5071 }
5072 next = x->forward[0];
5073 dictDelete(dict,x->obj);
5074 zslFreeNode(x);
5075 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5076 zsl->level--;
5077 zsl->length--;
5078 removed++;
5079 x = next;
5080 }
5081 return removed; /* not found */
5082}
5083
50c55df5 5084/* Find the first node having a score equal or greater than the specified one.
5085 * Returns NULL if there is no match. */
5086static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5087 zskiplistNode *x;
5088 int i;
5089
5090 x = zsl->header;
5091 for (i = zsl->level-1; i >= 0; i--) {
5092 while (x->forward[i] && x->forward[i]->score < score)
5093 x = x->forward[i];
5094 }
5095 /* We may have multiple elements with the same score, what we need
5096 * is to find the element with both the right score and object. */
5097 return x->forward[0];
5098}
5099
27b0ccca
PN
5100/* Find the rank for an element by both score and key.
5101 * Returns 0 when the element cannot be found, rank otherwise.
5102 * Note that the rank is 1-based due to the span of zsl->header to the
5103 * first element. */
5104static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5105 zskiplistNode *x;
5106 unsigned long rank = 0;
5107 int i;
5108
5109 x = zsl->header;
5110 for (i = zsl->level-1; i >= 0; i--) {
5111 while (x->forward[i] &&
5112 (x->forward[i]->score < score ||
5113 (x->forward[i]->score == score &&
5114 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5115 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5116 x = x->forward[i];
5117 }
5118
5119 /* x might be equal to zsl->header, so test if obj is non-NULL */
5120 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5121 return rank;
5122 }
5123 }
5124 return 0;
5125}
5126
e74825c2
PN
5127/* Finds an element by its rank. The rank argument needs to be 1-based. */
5128zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5129 zskiplistNode *x;
5130 unsigned long traversed = 0;
5131 int i;
5132
5133 x = zsl->header;
5134 for (i = zsl->level-1; i >= 0; i--) {
a50ea45c
PN
5135 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5136 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5137 x = x->forward[i];
5138 }
5139
5140 if (traversed == rank) {
5141 return x;
5142 }
5143 }
5144 return NULL;
5145}
5146
fd8ccf44 5147/* The actual Z-commands implementations */
5148
7db723ad 5149/* This generic command implements both ZADD and ZINCRBY.
e2665397 5150 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5151 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5152static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5153 robj *zsetobj;
5154 zset *zs;
5155 double *score;
5156
e2665397 5157 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5158 if (zsetobj == NULL) {
5159 zsetobj = createZsetObject();
e2665397 5160 dictAdd(c->db->dict,key,zsetobj);
5161 incrRefCount(key);
fd8ccf44 5162 } else {
5163 if (zsetobj->type != REDIS_ZSET) {
5164 addReply(c,shared.wrongtypeerr);
5165 return;
5166 }
5167 }
fd8ccf44 5168 zs = zsetobj->ptr;
e2665397 5169
7db723ad 5170 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5171 * needs to handle the two different conditions. It's all about setting
5172 * '*score', that is, the new score to set, to the right value. */
5173 score = zmalloc(sizeof(double));
5174 if (doincrement) {
5175 dictEntry *de;
5176
5177 /* Read the old score. If the element was not present starts from 0 */
5178 de = dictFind(zs->dict,ele);
5179 if (de) {
5180 double *oldscore = dictGetEntryVal(de);
5181 *score = *oldscore + scoreval;
5182 } else {
5183 *score = scoreval;
5184 }
5185 } else {
5186 *score = scoreval;
5187 }
5188
5189 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5190 * to both ZADD and ZINCRBY... */
e2665397 5191 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5192 /* case 1: New element */
e2665397 5193 incrRefCount(ele); /* added to hash */
5194 zslInsert(zs->zsl,*score,ele);
5195 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5196 server.dirty++;
e2665397 5197 if (doincrement)
e2665397 5198 addReplyDouble(c,*score);
91d71bfc 5199 else
5200 addReply(c,shared.cone);
fd8ccf44 5201 } else {
5202 dictEntry *de;
5203 double *oldscore;
5204
5205 /* case 2: Score update operation */
e2665397 5206 de = dictFind(zs->dict,ele);
dfc5e96c 5207 redisAssert(de != NULL);
fd8ccf44 5208 oldscore = dictGetEntryVal(de);
5209 if (*score != *oldscore) {
5210 int deleted;
5211
e2665397 5212 /* Remove and insert the element in the skip list with new score */
5213 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5214 redisAssert(deleted != 0);
e2665397 5215 zslInsert(zs->zsl,*score,ele);
5216 incrRefCount(ele);
5217 /* Update the score in the hash table */
5218 dictReplace(zs->dict,ele,score);
fd8ccf44 5219 server.dirty++;
2161a965 5220 } else {
5221 zfree(score);
fd8ccf44 5222 }
e2665397 5223 if (doincrement)
5224 addReplyDouble(c,*score);
5225 else
5226 addReply(c,shared.czero);
fd8ccf44 5227 }
5228}
5229
e2665397 5230static void zaddCommand(redisClient *c) {
5231 double scoreval;
5232
5233 scoreval = strtod(c->argv[2]->ptr,NULL);
5234 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5235}
5236
7db723ad 5237static void zincrbyCommand(redisClient *c) {
e2665397 5238 double scoreval;
5239
5240 scoreval = strtod(c->argv[2]->ptr,NULL);
5241 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5242}
5243
1b7106e7 5244static void zremCommand(redisClient *c) {
5245 robj *zsetobj;
5246 zset *zs;
5247
5248 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5249 if (zsetobj == NULL) {
5250 addReply(c,shared.czero);
5251 } else {
5252 dictEntry *de;
5253 double *oldscore;
5254 int deleted;
5255
5256 if (zsetobj->type != REDIS_ZSET) {
5257 addReply(c,shared.wrongtypeerr);
5258 return;
5259 }
5260 zs = zsetobj->ptr;
5261 de = dictFind(zs->dict,c->argv[2]);
5262 if (de == NULL) {
5263 addReply(c,shared.czero);
5264 return;
5265 }
5266 /* Delete from the skiplist */
5267 oldscore = dictGetEntryVal(de);
5268 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 5269 redisAssert(deleted != 0);
1b7106e7 5270
5271 /* Delete from the hash table */
5272 dictDelete(zs->dict,c->argv[2]);
5273 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5274 server.dirty++;
5275 addReply(c,shared.cone);
5276 }
5277}
5278
1807985b 5279static void zremrangebyscoreCommand(redisClient *c) {
5280 double min = strtod(c->argv[2]->ptr,NULL);
5281 double max = strtod(c->argv[3]->ptr,NULL);
5282 robj *zsetobj;
5283 zset *zs;
5284
5285 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5286 if (zsetobj == NULL) {
5287 addReply(c,shared.czero);
5288 } else {
5289 long deleted;
5290
5291 if (zsetobj->type != REDIS_ZSET) {
5292 addReply(c,shared.wrongtypeerr);
5293 return;
5294 }
5295 zs = zsetobj->ptr;
5296 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5297 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5298 server.dirty += deleted;
5299 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5300 }
5301}
5302
e3870fab 5303static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5304 robj *o;
5305 int start = atoi(c->argv[2]->ptr);
5306 int end = atoi(c->argv[3]->ptr);
752da584 5307 int withscores = 0;
5308
5309 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5310 withscores = 1;
5311 } else if (c->argc >= 5) {
5312 addReply(c,shared.syntaxerr);
5313 return;
5314 }
cc812361 5315
5316 o = lookupKeyRead(c->db,c->argv[1]);
5317 if (o == NULL) {
5318 addReply(c,shared.nullmultibulk);
5319 } else {
5320 if (o->type != REDIS_ZSET) {
5321 addReply(c,shared.wrongtypeerr);
5322 } else {
5323 zset *zsetobj = o->ptr;
5324 zskiplist *zsl = zsetobj->zsl;
5325 zskiplistNode *ln;
5326
5327 int llen = zsl->length;
5328 int rangelen, j;
5329 robj *ele;
5330
5331 /* convert negative indexes */
5332 if (start < 0) start = llen+start;
5333 if (end < 0) end = llen+end;
5334 if (start < 0) start = 0;
5335 if (end < 0) end = 0;
5336
5337 /* indexes sanity checks */
5338 if (start > end || start >= llen) {
5339 /* Out of range start or start > end result in empty list */
5340 addReply(c,shared.emptymultibulk);
5341 return;
5342 }
5343 if (end >= llen) end = llen-1;
5344 rangelen = (end-start)+1;
5345
edb51958
PN
5346 /* check if starting point is trivial, before searching
5347 * the element in log(N) time */
e3870fab 5348 if (reverse) {
edb51958 5349 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen - start);
e3870fab 5350 } else {
edb51958 5351 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start + 1);
e3870fab 5352 }
cc812361 5353
edb51958 5354 /* Return the result in form of a multi-bulk reply */
752da584 5355 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5356 withscores ? (rangelen*2) : rangelen));
cc812361 5357 for (j = 0; j < rangelen; j++) {
0aad7a19 5358 ele = ln->obj;
cc812361 5359 addReplyBulkLen(c,ele);
5360 addReply(c,ele);
5361 addReply(c,shared.crlf);
752da584 5362 if (withscores)
5363 addReplyDouble(c,ln->score);
e3870fab 5364 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5365 }
5366 }
5367 }
5368}
5369
e3870fab 5370static void zrangeCommand(redisClient *c) {
5371 zrangeGenericCommand(c,0);
5372}
5373
5374static void zrevrangeCommand(redisClient *c) {
5375 zrangeGenericCommand(c,1);
5376}
5377
f44dd428 5378/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5379 * If justcount is non-zero, just the count is returned. */
5380static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5381 robj *o;
f44dd428 5382 double min, max;
5383 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5384 int offset = 0, limit = -1;
0500ef27
SH
5385 int withscores = 0;
5386 int badsyntax = 0;
5387
f44dd428 5388 /* Parse the min-max interval. If one of the values is prefixed
5389 * by the "(" character, it's considered "open". For instance
5390 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5391 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5392 if (((char*)c->argv[2]->ptr)[0] == '(') {
5393 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5394 minex = 1;
5395 } else {
5396 min = strtod(c->argv[2]->ptr,NULL);
5397 }
5398 if (((char*)c->argv[3]->ptr)[0] == '(') {
5399 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5400 maxex = 1;
5401 } else {
5402 max = strtod(c->argv[3]->ptr,NULL);
5403 }
5404
5405 /* Parse "WITHSCORES": note that if the command was called with
5406 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5407 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5408 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5409 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5410 withscores = 1;
5411 else
5412 badsyntax = 1;
0500ef27 5413 }
3a3978b1 5414 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5415 badsyntax = 1;
0500ef27 5416 if (badsyntax) {
454d4e43 5417 addReplySds(c,
5418 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5419 return;
0500ef27
SH
5420 }
5421
f44dd428 5422 /* Parse "LIMIT" */
0500ef27 5423 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5424 addReply(c,shared.syntaxerr);
5425 return;
0500ef27 5426 } else if (c->argc == (7 + withscores)) {
80181f78 5427 offset = atoi(c->argv[5]->ptr);
5428 limit = atoi(c->argv[6]->ptr);
0b13687c 5429 if (offset < 0) offset = 0;
80181f78 5430 }
50c55df5 5431
f44dd428 5432 /* Ok, lookup the key and get the range */
50c55df5 5433 o = lookupKeyRead(c->db,c->argv[1]);
5434 if (o == NULL) {
f44dd428 5435 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
50c55df5 5436 } else {
5437 if (o->type != REDIS_ZSET) {
5438 addReply(c,shared.wrongtypeerr);
5439 } else {
5440 zset *zsetobj = o->ptr;
5441 zskiplist *zsl = zsetobj->zsl;
5442 zskiplistNode *ln;
f44dd428 5443 robj *ele, *lenobj = NULL;
5444 unsigned long rangelen = 0;
50c55df5 5445
f44dd428 5446 /* Get the first node with the score >= min, or with
5447 * score > min if 'minex' is true. */
50c55df5 5448 ln = zslFirstWithScore(zsl,min);
f44dd428 5449 while (minex && ln && ln->score == min) ln = ln->forward[0];
5450
50c55df5 5451 if (ln == NULL) {
5452 /* No element matching the speciifed interval */
f44dd428 5453 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5454 return;
5455 }
5456
5457 /* We don't know in advance how many matching elements there
5458 * are in the list, so we push this object that will represent
5459 * the multi-bulk length in the output buffer, and will "fix"
5460 * it later */
f44dd428 5461 if (!justcount) {
5462 lenobj = createObject(REDIS_STRING,NULL);
5463 addReply(c,lenobj);
5464 decrRefCount(lenobj);
5465 }
50c55df5 5466
f44dd428 5467 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5468 if (offset) {
5469 offset--;
5470 ln = ln->forward[0];
5471 continue;
5472 }
5473 if (limit == 0) break;
f44dd428 5474 if (!justcount) {
5475 ele = ln->obj;
5476 addReplyBulkLen(c,ele);
5477 addReply(c,ele);
5478 addReply(c,shared.crlf);
5479 if (withscores)
5480 addReplyDouble(c,ln->score);
5481 }
50c55df5 5482 ln = ln->forward[0];
5483 rangelen++;
80181f78 5484 if (limit > 0) limit--;
50c55df5 5485 }
f44dd428 5486 if (justcount) {
5487 addReplyLong(c,(long)rangelen);
5488 } else {
5489 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5490 withscores ? (rangelen*2) : rangelen);
5491 }
50c55df5 5492 }
5493 }
5494}
5495
f44dd428 5496static void zrangebyscoreCommand(redisClient *c) {
5497 genericZrangebyscoreCommand(c,0);
5498}
5499
5500static void zcountCommand(redisClient *c) {
5501 genericZrangebyscoreCommand(c,1);
5502}
5503
3c41331e 5504static void zcardCommand(redisClient *c) {
e197b441 5505 robj *o;
5506 zset *zs;
5507
5508 o = lookupKeyRead(c->db,c->argv[1]);
5509 if (o == NULL) {
5510 addReply(c,shared.czero);
5511 return;
5512 } else {
5513 if (o->type != REDIS_ZSET) {
5514 addReply(c,shared.wrongtypeerr);
5515 } else {
5516 zs = o->ptr;
682ac724 5517 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 5518 }
5519 }
5520}
5521
6e333bbe 5522static void zscoreCommand(redisClient *c) {
5523 robj *o;
5524 zset *zs;
5525
5526 o = lookupKeyRead(c->db,c->argv[1]);
5527 if (o == NULL) {
96d8b4ee 5528 addReply(c,shared.nullbulk);
6e333bbe 5529 return;
5530 } else {
5531 if (o->type != REDIS_ZSET) {
5532 addReply(c,shared.wrongtypeerr);
5533 } else {
5534 dictEntry *de;
5535
5536 zs = o->ptr;
5537 de = dictFind(zs->dict,c->argv[2]);
5538 if (!de) {
5539 addReply(c,shared.nullbulk);
5540 } else {
6e333bbe 5541 double *score = dictGetEntryVal(de);
5542
e2665397 5543 addReplyDouble(c,*score);
6e333bbe 5544 }
5545 }
5546 }
5547}
5548
69d95c3e
PN
5549static void zrankCommand(redisClient *c) {
5550 robj *o;
5551 o = lookupKeyRead(c->db,c->argv[1]);
5552 if (o == NULL) {
5553 addReply(c,shared.nullbulk);
5554 return;
5555 }
5556 if (o->type != REDIS_ZSET) {
5557 addReply(c,shared.wrongtypeerr);
27b0ccca
PN
5558 } else {
5559 zset *zs = o->ptr;
5560 zskiplist *zsl = zs->zsl;
5561 dictEntry *de;
5562 unsigned long rank;
69d95c3e 5563
27b0ccca
PN
5564 de = dictFind(zs->dict,c->argv[2]);
5565 if (!de) {
5566 addReply(c,shared.nullbulk);
5567 return;
69d95c3e
PN
5568 }
5569
27b0ccca
PN
5570 double *score = dictGetEntryVal(de);
5571 rank = zslGetRank(zsl, *score, c->argv[2]);
5572 if (rank) {
67cac143 5573 addReplyLong(c, rank-1);
27b0ccca
PN
5574 } else {
5575 addReply(c,shared.nullbulk);
69d95c3e 5576 }
978c2c94 5577 }
5578}
5579
5580/* ==================================== Hash ================================ */
5581static void hsetCommand(redisClient *c) {
5582 int update = 0;
5583 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5584
5585 if (o == NULL) {
5586 o = createHashObject();
5587 dictAdd(c->db->dict,c->argv[1],o);
5588 incrRefCount(c->argv[1]);
5589 } else {
5590 if (o->type != REDIS_HASH) {
5591 addReply(c,shared.wrongtypeerr);
5592 return;
5593 }
5594 }
5595 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5596 unsigned char *zm = o->ptr;
5597
5598 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5599 c->argv[3]->ptr,sdslen(c->argv[3]->ptr),&update);
5600 } else {
5601 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5602 incrRefCount(c->argv[2]);
5603 } else {
5604 update = 1;
5605 }
5606 incrRefCount(c->argv[3]);
5607 }
5608 server.dirty++;
5609 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5610}
5611
5612static void hgetCommand(redisClient *c) {
5613 robj *o = lookupKeyRead(c->db,c->argv[1]);
5614
5615 if (o == NULL) {
5616 addReply(c,shared.nullbulk);
5617 return;
5618 } else {
5619 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5620 unsigned char *zm = o->ptr;
5621 unsigned char *val;
5622 unsigned int vlen;
5623
5624 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
5625 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5626 addReplySds(c,sdsnewlen(val,vlen));
5627 addReply(c,shared.crlf);
5628 return;
5629 } else {
5630 addReply(c,shared.nullbulk);
5631 return;
5632 }
5633 } else {
5634 struct dictEntry *de;
5635
5636 de = dictFind(o->ptr,c->argv[2]);
5637 if (de == NULL) {
5638 addReply(c,shared.nullbulk);
5639 } else {
5640 robj *e = dictGetEntryVal(de);
5641
5642 addReplyBulkLen(c,e);
5643 addReply(c,e);
5644 addReply(c,shared.crlf);
5645 }
5646 }
69d95c3e 5647 }
69d95c3e
PN
5648}
5649
6b47e12e 5650/* ========================= Non type-specific commands ==================== */
5651
ed9b544e 5652static void flushdbCommand(redisClient *c) {
ca37e9cd 5653 server.dirty += dictSize(c->db->dict);
3305306f 5654 dictEmpty(c->db->dict);
5655 dictEmpty(c->db->expires);
ed9b544e 5656 addReply(c,shared.ok);
ed9b544e 5657}
5658
5659static void flushallCommand(redisClient *c) {
ca37e9cd 5660 server.dirty += emptyDb();
ed9b544e 5661 addReply(c,shared.ok);
f78fd11b 5662 rdbSave(server.dbfilename);
ca37e9cd 5663 server.dirty++;
ed9b544e 5664}
5665
56906eef 5666static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 5667 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 5668 so->type = type;
5669 so->pattern = pattern;
5670 return so;
5671}
5672
5673/* Return the value associated to the key with a name obtained
5674 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 5675static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 5676 char *p;
5677 sds spat, ssub;
5678 robj keyobj;
5679 int prefixlen, sublen, postfixlen;
ed9b544e 5680 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5681 struct {
f1017b3f 5682 long len;
5683 long free;
ed9b544e 5684 char buf[REDIS_SORTKEY_MAX+1];
5685 } keyname;
5686
28173a49 5687 /* If the pattern is "#" return the substitution object itself in order
5688 * to implement the "SORT ... GET #" feature. */
5689 spat = pattern->ptr;
5690 if (spat[0] == '#' && spat[1] == '\0') {
5691 return subst;
5692 }
5693
5694 /* The substitution object may be specially encoded. If so we create
9d65a1bb 5695 * a decoded object on the fly. Otherwise getDecodedObject will just
5696 * increment the ref count, that we'll decrement later. */
5697 subst = getDecodedObject(subst);
942a3961 5698
ed9b544e 5699 ssub = subst->ptr;
5700 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5701 p = strchr(spat,'*');
ed5a857a 5702 if (!p) {
5703 decrRefCount(subst);
5704 return NULL;
5705 }
ed9b544e 5706
5707 prefixlen = p-spat;
5708 sublen = sdslen(ssub);
5709 postfixlen = sdslen(spat)-(prefixlen+1);
5710 memcpy(keyname.buf,spat,prefixlen);
5711 memcpy(keyname.buf+prefixlen,ssub,sublen);
5712 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5713 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5714 keyname.len = prefixlen+sublen+postfixlen;
5715
dfc5e96c 5716 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 5717 decrRefCount(subst);
5718
a4d1ba9a 5719 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 5720 return lookupKeyRead(db,&keyobj);
ed9b544e 5721}
5722
5723/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5724 * the additional parameter is not standard but a BSD-specific we have to
5725 * pass sorting parameters via the global 'server' structure */
5726static int sortCompare(const void *s1, const void *s2) {
5727 const redisSortObject *so1 = s1, *so2 = s2;
5728 int cmp;
5729
5730 if (!server.sort_alpha) {
5731 /* Numeric sorting. Here it's trivial as we precomputed scores */
5732 if (so1->u.score > so2->u.score) {
5733 cmp = 1;
5734 } else if (so1->u.score < so2->u.score) {
5735 cmp = -1;
5736 } else {
5737 cmp = 0;
5738 }
5739 } else {
5740 /* Alphanumeric sorting */
5741 if (server.sort_bypattern) {
5742 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5743 /* At least one compare object is NULL */
5744 if (so1->u.cmpobj == so2->u.cmpobj)
5745 cmp = 0;
5746 else if (so1->u.cmpobj == NULL)
5747 cmp = -1;
5748 else
5749 cmp = 1;
5750 } else {
5751 /* We have both the objects, use strcoll */
5752 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5753 }
5754 } else {
5755 /* Compare elements directly */
9d65a1bb 5756 robj *dec1, *dec2;
5757
5758 dec1 = getDecodedObject(so1->obj);
5759 dec2 = getDecodedObject(so2->obj);
5760 cmp = strcoll(dec1->ptr,dec2->ptr);
5761 decrRefCount(dec1);
5762 decrRefCount(dec2);
ed9b544e 5763 }
5764 }
5765 return server.sort_desc ? -cmp : cmp;
5766}
5767
5768/* The SORT command is the most complex command in Redis. Warning: this code
5769 * is optimized for speed and a bit less for readability.
 *
 * c->argv[1] is the key to sort (list, set or sorted set). The options
 * recognised by the parser below are ASC, DESC, ALPHA, LIMIT <start> <count>,
 * STORE <key>, BY <pattern> and GET <pattern> (GET may be repeated).
 *
 * Without STORE the result is sent to the client as a multi-bulk reply;
 * with STORE it is saved as a list at the given key and the number of
 * stored elements is replied as an integer. */
5770static void sortCommand(redisClient *c) {
ed9b544e 5771 list *operations;
5772 int outputlen = 0;
5773 int desc = 0, alpha = 0;
5774 int limit_start = 0, limit_count = -1, start, end;
5775 int j, dontsort = 0, vectorlen;
5776 int getop = 0; /* GET operation counter */
443c6409 5777 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 5778 redisSortObject *vector; /* Resulting vector to sort */
5779
5780 /* Lookup the key to sort. It must be of the right types */
3305306f 5781 sortval = lookupKeyRead(c->db,c->argv[1]);
5782 if (sortval == NULL) {
d922ae65 5783 addReply(c,shared.nullmultibulk);
ed9b544e 5784 return;
5785 }
a5eb649b 5786 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5787 sortval->type != REDIS_ZSET)
5788 {
c937aa89 5789 addReply(c,shared.wrongtypeerr);
ed9b544e 5790 return;
5791 }
5792
5793 /* Create a list of operations to perform for every sorted element.
5794 * Operations can be GET/DEL/INCR/DECR */
/* NOTE(review): only REDIS_SORT_GET operations are ever created by the
 * parser below; the other kinds named above are not handled here. */
5795 operations = listCreate();
092dac2a 5796 listSetFreeMethod(operations,zfree);
ed9b544e 5797 j = 2;
5798
5799 /* Now we need to protect sortval incrementing its count, in the future
5800 * SORT may have options able to overwrite/delete keys during the sorting
5801 * and the sorted key itself may get destroyed */
5802 incrRefCount(sortval);
5803
5804 /* The SORT command has an SQL-alike syntax, parse it */
/* 'leftargs' is the number of arguments following argv[j]. Options that take
 * operands are only recognised when enough arguments remain, otherwise they
 * fall through to the syntax error branch. */
5805 while(j < c->argc) {
5806 int leftargs = c->argc-j-1;
5807 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5808 desc = 0;
5809 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5810 desc = 1;
5811 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5812 alpha = 1;
5813 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5814 limit_start = atoi(c->argv[j+1]->ptr);
5815 limit_count = atoi(c->argv[j+2]->ptr);
5816 j+=2;
443c6409 5817 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5818 storekey = c->argv[j+1];
5819 j++;
ed9b544e 5820 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5821 sortby = c->argv[j+1];
5822 /* If the BY pattern does not contain '*', i.e. it is constant,
5823 * we don't need to sort nor to lookup the weight keys. */
5824 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5825 j++;
5826 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5827 listAddNodeTail(operations,createSortOperation(
5828 REDIS_SORT_GET,c->argv[j+1]));
5829 getop++;
5830 j++;
ed9b544e 5831 } else {
/* Unknown option or missing operand: undo the reference taken on sortval
 * and release the operations list before replying with a syntax error. */
5832 decrRefCount(sortval);
5833 listRelease(operations);
c937aa89 5834 addReply(c,shared.syntaxerr);
ed9b544e 5835 return;
5836 }
5837 j++;
5838 }
5839
5840 /* Load the sorting vector with all the objects to sort */
a5eb649b 5841 switch(sortval->type) {
5842 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5843 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5844 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 5845 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 5846 }
ed9b544e 5847 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 5848 j = 0;
a5eb649b 5849
ed9b544e 5850 if (sortval->type == REDIS_LIST) {
5851 list *list = sortval->ptr;
6208b3a7 5852 listNode *ln;
c7df85a4 5853 listIter li;
6208b3a7 5854
c7df85a4 5855 listRewind(list,&li);
5856 while((ln = listNext(&li))) {
ed9b544e 5857 robj *ele = ln->value;
5858 vector[j].obj = ele;
5859 vector[j].u.score = 0;
5860 vector[j].u.cmpobj = NULL;
ed9b544e 5861 j++;
5862 }
5863 } else {
/* Sets and sorted sets are both walked through a dict: for a sorted set the
 * member dictionary (zs->dict) is used. */
a5eb649b 5864 dict *set;
ed9b544e 5865 dictIterator *di;
5866 dictEntry *setele;
5867
a5eb649b 5868 if (sortval->type == REDIS_SET) {
5869 set = sortval->ptr;
5870 } else {
5871 zset *zs = sortval->ptr;
5872 set = zs->dict;
5873 }
5874
ed9b544e 5875 di = dictGetIterator(set);
ed9b544e 5876 while((setele = dictNext(di)) != NULL) {
5877 vector[j].obj = dictGetEntryKey(setele);
5878 vector[j].u.score = 0;
5879 vector[j].u.cmpobj = NULL;
5880 j++;
5881 }
5882 dictReleaseIterator(di);
5883 }
dfc5e96c 5884 redisAssert(j == vectorlen);
ed9b544e 5885
5886 /* Now it's time to load the right scores in the sorting vector */
5887 if (dontsort == 0) {
5888 for (j = 0; j < vectorlen; j++) {
5889 if (sortby) {
5890 robj *byval;
5891
3305306f 5892 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* Elements whose weight key is missing or is not a string keep the values
 * assigned while loading the vector. */
ed9b544e 5893 if (!byval || byval->type != REDIS_STRING) continue;
5894 if (alpha) {
/* The reference obtained here is released in the cleanup loop at the end of
 * the function. */
9d65a1bb 5895 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 5896 } else {
942a3961 5897 if (byval->encoding == REDIS_ENCODING_RAW) {
5898 vector[j].u.score = strtod(byval->ptr,NULL);
5899 } else {
9d65a1bb 5900 /* Don't need to decode the object if it's
5901 * integer-encoded (the only encoding supported) so
5902 * far. We can just cast it */
f1017b3f 5903 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 5904 vector[j].u.score = (long)byval->ptr;
f1017b3f 5905 } else
dfc5e96c 5906 redisAssert(1 != 1);
942a3961 5907 }
ed9b544e 5908 }
5909 } else {
/* No BY option: for numeric sorting the score is taken from the element
 * itself; for ALPHA sorting nothing is precomputed (sortCompare() compares
 * the elements directly). */
942a3961 5910 if (!alpha) {
5911 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5912 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5913 else {
5914 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5915 vector[j].u.score = (long) vector[j].obj->ptr;
5916 else
dfc5e96c 5917 redisAssert(1 != 1);
942a3961 5918 }
5919 }
ed9b544e 5920 }
5921 }
5922 }
5923
5924 /* We are ready to sort the vector... perform a bit of sanity check
5925 * on the LIMIT option too. We'll use a partial version of quicksort. */
5926 start = (limit_start < 0) ? 0 : limit_start;
5927 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
/* A start past the end of the vector is turned into an empty range
 * (end-start+1 == 0), so the output loops below do not run. */
5928 if (start >= vectorlen) {
5929 start = vectorlen-1;
5930 end = vectorlen-2;
5931 }
5932 if (end >= vectorlen) end = vectorlen-1;
5933
5934 if (dontsort == 0) {
/* sortCompare() reads its parameters from the global 'server' structure. */
5935 server.sort_desc = desc;
5936 server.sort_alpha = alpha;
5937 server.sort_bypattern = sortby ? 1 : 0;
/* The partial sort is used only with BY and a range that does not cover the
 * whole vector; otherwise the whole vector is sorted with qsort(). */
5f5b9840 5938 if (sortby && (start != 0 || end != vectorlen-1))
5939 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5940 else
5941 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 5942 }
5943
5944 /* Send command output to the output buffer, performing the specified
5945 * GET/DEL/INCR/DECR operations if any. */
/* With GET operations each element in range yields one item per GET;
 * without them it yields the element itself. */
5946 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 5947 if (storekey == NULL) {
5948 /* STORE option not specified, sent the sorting result to client */
5949 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5950 for (j = start; j <= end; j++) {
5951 listNode *ln;
c7df85a4 5952 listIter li;
5953
443c6409 5954 if (!getop) {
5955 addReplyBulkLen(c,vector[j].obj);
5956 addReply(c,vector[j].obj);
5957 addReply(c,shared.crlf);
5958 }
c7df85a4 5959 listRewind(operations,&li);
5960 while((ln = listNext(&li))) {
443c6409 5961 redisSortOperation *sop = ln->value;
5962 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5963 vector[j].obj);
5964
5965 if (sop->type == REDIS_SORT_GET) {
5966 if (!val || val->type != REDIS_STRING) {
5967 addReply(c,shared.nullbulk);
5968 } else {
5969 addReplyBulkLen(c,val);
5970 addReply(c,val);
5971 addReply(c,shared.crlf);
5972 }
5973 } else {
dfc5e96c 5974 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 5975 }
5976 }
ed9b544e 5977 }
443c6409 5978 } else {
5979 robj *listObject = createListObject();
5980 list *listPtr = (list*) listObject->ptr;
5981
5982 /* STORE option specified, set the sorting result as a List object */
5983 for (j = start; j <= end; j++) {
5984 listNode *ln;
c7df85a4 5985 listIter li;
5986
443c6409 5987 if (!getop) {
5988 listAddNodeTail(listPtr,vector[j].obj);
5989 incrRefCount(vector[j].obj);
5990 }
c7df85a4 5991 listRewind(operations,&li);
5992 while((ln = listNext(&li))) {
443c6409 5993 redisSortOperation *sop = ln->value;
5994 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5995 vector[j].obj);
5996
5997 if (sop->type == REDIS_SORT_GET) {
/* Unlike the reply path above, a missing or non-string GET value is stored
 * as an empty string. */
5998 if (!val || val->type != REDIS_STRING) {
5999 listAddNodeTail(listPtr,createStringObject("",0));
6000 } else {
6001 listAddNodeTail(listPtr,val);
6002 incrRefCount(val);
6003 }
ed9b544e 6004 } else {
dfc5e96c 6005 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6006 }
ed9b544e 6007 }
ed9b544e 6008 }
/* The key object is retained only when dictReplace() returns non-zero. */
121796f7 6009 if (dictReplace(c->db->dict,storekey,listObject)) {
6010 incrRefCount(storekey);
6011 }
443c6409 6012 /* Note: we add 1 because the DB is dirty anyway since even if the
6013 * SORT result is empty a new key is set and maybe the old content
6014 * replaced. */
6015 server.dirty += 1+outputlen;
6016 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6017 }
6018
6019 /* Cleanup */
6020 decrRefCount(sortval);
6021 listRelease(operations);
/* Release the decoded compare objects taken in the BY + ALPHA scoring path. */
6022 for (j = 0; j < vectorlen; j++) {
6023 if (sortby && alpha && vector[j].u.cmpobj)
6024 decrRefCount(vector[j].u.cmpobj);
6025 }
6026 zfree(vector);
6027}
6028
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be provided by the caller; the
 * longest output is a 20-digit byte count followed by 'B' and the
 * terminating NUL. Values below 1024 are printed as an integer number of
 * bytes, larger ones with two decimals and a K/M/G/T/P suffix (powers of
 * 1024). Anything of 1024^6 bytes or more falls back to a plain byte count,
 * so the buffer is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024ULL) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024ULL);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024ULL*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the suffixes above: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6049
1c85b79f 6050/* Create the string returned by the INFO command. This is decoupled
6051 * by the INFO command itself as we need to report the same information
6052 * on memory corruption problems.
 *
 * Returns an sds string made of "name:value\r\n" lines: a fixed general
 * section, then a master_* section when server.masterhost is set, a vm_*
 * section when server.vm_enabled is set, and finally one "dbN:" line for
 * every DB that has at least one key or one expire. In this file
 * infoCommand() passes the returned string to addReplySds(). */
6053static sds genRedisInfoString(void) {
ed9b544e 6054 sds info;
6055 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6056 int j;
ec6c7a1d 6057 char hmem[64];
6058
/* Human readable version of the used memory, reported next to the raw
 * value. */
b72f6a4b 6059 bytesToHuman(hmem,zmalloc_used_memory());
/* NOTE: the arguments below must stay in the same order as the format
 * specifiers in the string. */
ed9b544e 6060 info = sdscatprintf(sdsempty(),
6061 "redis_version:%s\r\n"
f1017b3f 6062 "arch_bits:%s\r\n"
7a932b74 6063 "multiplexing_api:%s\r\n"
0d7170a4 6064 "process_id:%ld\r\n"
682ac724 6065 "uptime_in_seconds:%ld\r\n"
6066 "uptime_in_days:%ld\r\n"
ed9b544e 6067 "connected_clients:%d\r\n"
6068 "connected_slaves:%d\r\n"
f86a74e9 6069 "blocked_clients:%d\r\n"
5fba9f71 6070 "used_memory:%zu\r\n"
ec6c7a1d 6071 "used_memory_human:%s\r\n"
ed9b544e 6072 "changes_since_last_save:%lld\r\n"
be2bb6b0 6073 "bgsave_in_progress:%d\r\n"
682ac724 6074 "last_save_time:%ld\r\n"
b3fad521 6075 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6076 "total_connections_received:%lld\r\n"
6077 "total_commands_processed:%lld\r\n"
7d98e08c 6078 "vm_enabled:%d\r\n"
a0f643ea 6079 "role:%s\r\n"
ed9b544e 6080 ,REDIS_VERSION,
/* arch_bits is derived from the size of 'long'. */
f1017b3f 6081 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6082 aeGetApiName(),
0d7170a4 6083 (long) getpid(),
a0f643ea 6084 uptime,
6085 uptime/(3600*24),
/* connected_clients does not include the clients listed in server.slaves. */
ed9b544e 6086 listLength(server.clients)-listLength(server.slaves),
6087 listLength(server.slaves),
d5d55fc3 6088 server.blpop_blocked_clients,
b72f6a4b 6089 zmalloc_used_memory(),
ec6c7a1d 6090 hmem,
ed9b544e 6091 server.dirty,
/* A child pid different from -1 is reported as "in progress". */
9d65a1bb 6092 server.bgsavechildpid != -1,
ed9b544e 6093 server.lastsave,
b3fad521 6094 server.bgrewritechildpid != -1,
ed9b544e 6095 server.stat_numconnections,
6096 server.stat_numcommands,
7d98e08c 6097 server.vm_enabled != 0,
a0f643ea 6098 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6099 );
a0f643ea 6100 if (server.masterhost) {
6101 info = sdscatprintf(info,
6102 "master_host:%s\r\n"
6103 "master_port:%d\r\n"
6104 "master_link_status:%s\r\n"
6105 "master_last_io_seconds_ago:%d\r\n"
6106 ,server.masterhost,
6107 server.masterport,
6108 (server.replstate == REDIS_REPL_CONNECTED) ?
6109 "up" : "down",
/* -1 is reported when there is no server.master client object. */
f72b934d 6110 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6111 );
6112 }
7d98e08c 6113 if (server.vm_enabled) {
/* NOTE(review): the lock presumably protects the io_* lists and counters
 * read below from concurrent modification by the I/O threads -- confirm
 * against lockThreadedIO(). */
1064ef87 6114 lockThreadedIO();
7d98e08c 6115 info = sdscatprintf(info,
6116 "vm_conf_max_memory:%llu\r\n"
6117 "vm_conf_page_size:%llu\r\n"
6118 "vm_conf_pages:%llu\r\n"
6119 "vm_stats_used_pages:%llu\r\n"
6120 "vm_stats_swapped_objects:%llu\r\n"
6121 "vm_stats_swappin_count:%llu\r\n"
6122 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6123 "vm_stats_io_newjobs_len:%lu\r\n"
6124 "vm_stats_io_processing_len:%lu\r\n"
6125 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6126 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6127 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6128 ,(unsigned long long) server.vm_max_memory,
6129 (unsigned long long) server.vm_page_size,
6130 (unsigned long long) server.vm_pages,
6131 (unsigned long long) server.vm_stats_used_pages,
6132 (unsigned long long) server.vm_stats_swapped_objects,
6133 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6134 (unsigned long long) server.vm_stats_swapouts,
6135 (unsigned long) listLength(server.io_newjobs),
6136 (unsigned long) listLength(server.io_processing),
6137 (unsigned long) listLength(server.io_processed),
d5d55fc3 6138 (unsigned long) server.io_active_threads,
6139 (unsigned long) server.vm_blocked_clients
7d98e08c 6140 );
1064ef87 6141 unlockThreadedIO();
7d98e08c 6142 }
/* Per-DB statistics: DBs with neither keys nor expires are omitted. */
c3cb078d 6143 for (j = 0; j < server.dbnum; j++) {
6144 long long keys, vkeys;
6145
6146 keys = dictSize(server.db[j].dict);
6147 vkeys = dictSize(server.db[j].expires);
6148 if (keys || vkeys) {
9d65a1bb 6149 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6150 j, keys, vkeys);
6151 }
6152 }
1c85b79f 6153 return info;
6154}
6155
6156static void infoCommand(redisClient *c) {
6157 sds info = genRedisInfoString();
83c6a618 6158 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6159 (unsigned long)sdslen(info)));
ed9b544e 6160 addReplySds(c,info);
70003d28 6161 addReply(c,shared.crlf);
ed9b544e 6162}
6163
3305306f 6164static void monitorCommand(redisClient *c) {
6165 /* ignore MONITOR if aleady slave or in monitor mode */
6166 if (c->flags & REDIS_SLAVE) return;
6167
6168 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6169 c->slaveseldb = 0;
6b47e12e 6170 listAddNodeTail(server.monitors,c);
3305306f 6171 addReply(c,shared.ok);
6172}
6173
6174/* ================================= Expire ================================= */
6175static int removeExpire(redisDb *db, robj *key) {
6176 if (dictDelete(db->expires,key) == DICT_OK) {
6177 return 1;
6178 } else {
6179 return 0;
6180 }
6181}
6182
6183static int setExpire(redisDb *db, robj *key, time_t when) {
6184 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6185 return 0;
6186 } else {
6187 incrRefCount(key);
6188 return 1;
6189 }
6190}
6191
bb32ede5 6192/* Return the expire time of the specified key, or -1 if no expire
6193 * is associated with this key (i.e. the key is non volatile) */
6194static time_t getExpire(redisDb *db, robj *key) {
6195 dictEntry *de;
6196
6197 /* No expire? return ASAP */
6198 if (dictSize(db->expires) == 0 ||
6199 (de = dictFind(db->expires,key)) == NULL) return -1;
6200
6201 return (time_t) dictGetEntryVal(de);
6202}
6203
3305306f 6204static int expireIfNeeded(redisDb *db, robj *key) {
6205 time_t when;
6206 dictEntry *de;
6207
6208 /* No expire? return ASAP */
6209 if (dictSize(db->expires) == 0 ||
6210 (de = dictFind(db->expires,key)) == NULL) return 0;
6211
6212 /* Lookup the expire */
6213 when = (time_t) dictGetEntryVal(de);
6214 if (time(NULL) <= when) return 0;
6215
6216 /* Delete the key */
6217 dictDelete(db->expires,key);
6218 return dictDelete(db->dict,key) == DICT_OK;
6219}
6220
6221static int deleteIfVolatile(redisDb *db, robj *key) {
6222 dictEntry *de;
6223
6224 /* No expire? return ASAP */
6225 if (dictSize(db->expires) == 0 ||
6226 (de = dictFind(db->expires,key)) == NULL) return 0;
6227
6228 /* Delete the key */
0c66a471 6229 server.dirty++;
3305306f 6230 dictDelete(db->expires,key);
6231 return dictDelete(db->dict,key) == DICT_OK;
6232}
6233
802e8373 6234static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 6235 dictEntry *de;
3305306f 6236
802e8373 6237 de = dictFind(c->db->dict,key);
3305306f 6238 if (de == NULL) {
6239 addReply(c,shared.czero);
6240 return;
6241 }
43e5ccdf 6242 if (seconds < 0) {
6243 if (deleteKey(c->db,key)) server.dirty++;
6244 addReply(c, shared.cone);
3305306f 6245 return;
6246 } else {
6247 time_t when = time(NULL)+seconds;
802e8373 6248 if (setExpire(c->db,key,when)) {
3305306f 6249 addReply(c,shared.cone);
77423026 6250 server.dirty++;
6251 } else {
3305306f 6252 addReply(c,shared.czero);
77423026 6253 }
3305306f 6254 return;
6255 }
6256}
6257
802e8373 6258static void expireCommand(redisClient *c) {
6259 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6260}
6261
6262static void expireatCommand(redisClient *c) {
6263 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6264}
6265
fd88489a 6266static void ttlCommand(redisClient *c) {
6267 time_t expire;
6268 int ttl = -1;
6269
6270 expire = getExpire(c->db,c->argv[1]);
6271 if (expire != -1) {
6272 ttl = (int) (expire-time(NULL));
6273 if (ttl < 0) ttl = -1;
6274 }
6275 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6276}
6277
6e469882 6278/* ================================ MULTI/EXEC ============================== */
6279
6280/* Client state initialization for MULTI/EXEC */
6281static void initClientMultiState(redisClient *c) {
6282 c->mstate.commands = NULL;
6283 c->mstate.count = 0;
6284}
6285
6286/* Release all the resources associated with MULTI/EXEC state */
6287static void freeClientMultiState(redisClient *c) {
6288 int j;
6289
6290 for (j = 0; j < c->mstate.count; j++) {
6291 int i;
6292 multiCmd *mc = c->mstate.commands+j;
6293
6294 for (i = 0; i < mc->argc; i++)
6295 decrRefCount(mc->argv[i]);
6296 zfree(mc->argv);
6297 }
6298 zfree(c->mstate.commands);
6299}
6300
6301/* Add a new command into the MULTI commands queue */
6302static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6303 multiCmd *mc;
6304 int j;
6305
6306 c->mstate.commands = zrealloc(c->mstate.commands,
6307 sizeof(multiCmd)*(c->mstate.count+1));
6308 mc = c->mstate.commands+c->mstate.count;
6309 mc->cmd = cmd;
6310 mc->argc = c->argc;
6311 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6312 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6313 for (j = 0; j < c->argc; j++)
6314 incrRefCount(mc->argv[j]);
6315 c->mstate.count++;
6316}
6317
6318static void multiCommand(redisClient *c) {
6319 c->flags |= REDIS_MULTI;
36c548f0 6320 addReply(c,shared.ok);
6e469882 6321}
6322
18b6cb76
DJ
6323static void discardCommand(redisClient *c) {
6324 if (!(c->flags & REDIS_MULTI)) {
6325 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6326 return;
6327 }
6328
6329 freeClientMultiState(c);
6330 initClientMultiState(c);
6331 c->flags &= (~REDIS_MULTI);
6332 addReply(c,shared.ok);
6333}
6334
6e469882 6335static void execCommand(redisClient *c) {
6336 int j;
6337 robj **orig_argv;
6338 int orig_argc;
6339
6340 if (!(c->flags & REDIS_MULTI)) {
6341 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6342 return;
6343 }
6344
6345 orig_argv = c->argv;
6346 orig_argc = c->argc;
6347 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6348 for (j = 0; j < c->mstate.count; j++) {
6349 c->argc = c->mstate.commands[j].argc;
6350 c->argv = c->mstate.commands[j].argv;
6351 call(c,c->mstate.commands[j].cmd);
6352 }
6353 c->argv = orig_argv;
6354 c->argc = orig_argc;
6355 freeClientMultiState(c);
6356 initClientMultiState(c);
6357 c->flags &= (~REDIS_MULTI);
6358}
6359
4409877e 6360/* =========================== Blocking Operations ========================= */
6361
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still, it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty, we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6390
6391/* Set a client in blocking mode for the specified key, with the specified
6392 * timeout */
b177fd30 6393static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 6394 dictEntry *de;
6395 list *l;
b177fd30 6396 int j;
4409877e 6397
b177fd30 6398 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6399 c->blockingkeysnum = numkeys;
4409877e 6400 c->blockingto = timeout;
b177fd30 6401 for (j = 0; j < numkeys; j++) {
6402 /* Add the key in the client structure, to map clients -> keys */
6403 c->blockingkeys[j] = keys[j];
6404 incrRefCount(keys[j]);
4409877e 6405
b177fd30 6406 /* And in the other "side", to map keys -> clients */
6407 de = dictFind(c->db->blockingkeys,keys[j]);
6408 if (de == NULL) {
6409 int retval;
6410
6411 /* For every key we take a list of clients blocked for it */
6412 l = listCreate();
6413 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6414 incrRefCount(keys[j]);
6415 assert(retval == DICT_OK);
6416 } else {
6417 l = dictGetEntryVal(de);
6418 }
6419 listAddNodeTail(l,c);
4409877e 6420 }
b177fd30 6421 /* Mark the client as a blocked client */
4409877e 6422 c->flags |= REDIS_BLOCKED;
d5d55fc3 6423 server.blpop_blocked_clients++;
4409877e 6424}
6425
6426/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 6427static void unblockClientWaitingData(redisClient *c) {
4409877e 6428 dictEntry *de;
6429 list *l;
b177fd30 6430 int j;
4409877e 6431
b177fd30 6432 assert(c->blockingkeys != NULL);
6433 /* The client may wait for multiple keys, so unblock it for every key. */
6434 for (j = 0; j < c->blockingkeysnum; j++) {
6435 /* Remove this client from the list of clients waiting for this key. */
6436 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6437 assert(de != NULL);
6438 l = dictGetEntryVal(de);
6439 listDelNode(l,listSearchKey(l,c));
6440 /* If the list is empty we need to remove it to avoid wasting memory */
6441 if (listLength(l) == 0)
6442 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6443 decrRefCount(c->blockingkeys[j]);
6444 }
6445 /* Cleanup the client structure */
6446 zfree(c->blockingkeys);
6447 c->blockingkeys = NULL;
4409877e 6448 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 6449 server.blpop_blocked_clients--;
5921aa36 6450 /* We want to process data if there is some command waiting
b0d8747d 6451 * in the input buffer. Note that this is safe even if
6452 * unblockClientWaitingData() gets called from freeClient() because
6453 * freeClient() will be smart enough to call this function
6454 * *after* c->querybuf was set to NULL. */
4409877e 6455 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6456}
6457
6458/* This should be called from any function PUSHing into lists.
6459 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6460 * 'ele' is the element pushed.
6461 *
6462 * If the function returns 0 there was no client waiting for a list push
6463 * against this key.
6464 *
6465 * If the function returns 1 there was a client waiting for a list push
6466 * against this key, the element was passed to this client thus it's not
6467 * needed to actually add it to the list and the caller should return asap. */
6468static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6469 struct dictEntry *de;
6470 redisClient *receiver;
6471 list *l;
6472 listNode *ln;
6473
6474 de = dictFind(c->db->blockingkeys,key);
6475 if (de == NULL) return 0;
6476 l = dictGetEntryVal(de);
6477 ln = listFirst(l);
6478 assert(ln != NULL);
6479 receiver = ln->value;
4409877e 6480
b177fd30 6481 addReplySds(receiver,sdsnew("*2\r\n"));
6482 addReplyBulkLen(receiver,key);
6483 addReply(receiver,key);
6484 addReply(receiver,shared.crlf);
4409877e 6485 addReplyBulkLen(receiver,ele);
6486 addReply(receiver,ele);
6487 addReply(receiver,shared.crlf);
b0d8747d 6488 unblockClientWaitingData(receiver);
4409877e 6489 return 1;
6490}
6491
6492/* Blocking RPOP/LPOP */
6493static void blockingPopGenericCommand(redisClient *c, int where) {
6494 robj *o;
6495 time_t timeout;
b177fd30 6496 int j;
4409877e 6497
b177fd30 6498 for (j = 1; j < c->argc-1; j++) {
6499 o = lookupKeyWrite(c->db,c->argv[j]);
6500 if (o != NULL) {
6501 if (o->type != REDIS_LIST) {
6502 addReply(c,shared.wrongtypeerr);
4409877e 6503 return;
b177fd30 6504 } else {
6505 list *list = o->ptr;
6506 if (listLength(list) != 0) {
6507 /* If the list contains elements fall back to the usual
6508 * non-blocking POP operation */
6509 robj *argv[2], **orig_argv;
6510 int orig_argc;
6511
6512 /* We need to alter the command arguments before to call
6513 * popGenericCommand() as the command takes a single key. */
6514 orig_argv = c->argv;
6515 orig_argc = c->argc;
6516 argv[1] = c->argv[j];
6517 c->argv = argv;
6518 c->argc = 2;
6519
6520 /* Also the return value is different, we need to output
6521 * the multi bulk reply header and the key name. The
6522 * "real" command will add the last element (the value)
6523 * for us. If this souds like an hack to you it's just
6524 * because it is... */
6525 addReplySds(c,sdsnew("*2\r\n"));
6526 addReplyBulkLen(c,argv[1]);
6527 addReply(c,argv[1]);
6528 addReply(c,shared.crlf);
6529 popGenericCommand(c,where);
6530
6531 /* Fix the client structure with the original stuff */
6532 c->argv = orig_argv;
6533 c->argc = orig_argc;
6534 return;
6535 }
4409877e 6536 }
6537 }
6538 }
6539 /* If the list is empty or the key does not exists we must block */
b177fd30 6540 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 6541 if (timeout > 0) timeout += time(NULL);
b177fd30 6542 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 6543}
6544
6545static void blpopCommand(redisClient *c) {
6546 blockingPopGenericCommand(c,REDIS_HEAD);
6547}
6548
6549static void brpopCommand(redisClient *c) {
6550 blockingPopGenericCommand(c,REDIS_TAIL);
6551}
6552
ed9b544e 6553/* =============================== Replication ============================= */
6554
a4d1ba9a 6555static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6556 ssize_t nwritten, ret = size;
6557 time_t start = time(NULL);
6558
6559 timeout++;
6560 while(size) {
6561 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6562 nwritten = write(fd,ptr,size);
6563 if (nwritten == -1) return -1;
6564 ptr += nwritten;
6565 size -= nwritten;
6566 }
6567 if ((time(NULL)-start) > timeout) {
6568 errno = ETIMEDOUT;
6569 return -1;
6570 }
6571 }
6572 return ret;
6573}
6574
a4d1ba9a 6575static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6576 ssize_t nread, totread = 0;
6577 time_t start = time(NULL);
6578
6579 timeout++;
6580 while(size) {
6581 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6582 nread = read(fd,ptr,size);
6583 if (nread == -1) return -1;
6584 ptr += nread;
6585 size -= nread;
6586 totread += nread;
6587 }
6588 if ((time(NULL)-start) > timeout) {
6589 errno = ETIMEDOUT;
6590 return -1;
6591 }
6592 }
6593 return totread;
6594}
6595
/* Read a line from 'fd' into 'ptr', one byte at a time, stopping at the
 * first '\n'. 'size' is the capacity of the destination buffer: at most
 * size-1 characters are stored, and the buffer is always nul terminated
 * after the first byte has been processed. The trailing "\n" (or "\r\n")
 * is not part of the result.
 *
 * 'timeout' is applied to the read of every single byte, see syncRead().
 *
 * Returns the number of characters stored before the newline (a trailing
 * '\r', if any, is replaced by the terminator but still counted), or -1 on
 * read error or timeout. If the buffer fills up before a newline is seen
 * the truncated line is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the nul terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, otherwise a line longer
             * than the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
6616
6617static void syncCommand(redisClient *c) {
40d224a9 6618 /* ignore SYNC if aleady slave or in monitor mode */
6619 if (c->flags & REDIS_SLAVE) return;
6620
6621 /* SYNC can't be issued when the server has pending data to send to
6622 * the client about already issued commands. We need a fresh reply
6623 * buffer registering the differences between the BGSAVE and the current
6624 * dataset, so that we can copy to other slaves if needed. */
6625 if (listLength(c->reply) != 0) {
6626 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6627 return;
6628 }
6629
6630 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6631 /* Here we need to check if there is a background saving operation
6632 * in progress, or if it is required to start one */
9d65a1bb 6633 if (server.bgsavechildpid != -1) {
40d224a9 6634 /* Ok a background save is in progress. Let's check if it is a good
6635 * one for replication, i.e. if there is another slave that is
6636 * registering differences since the server forked to save */
6637 redisClient *slave;
6638 listNode *ln;
c7df85a4 6639 listIter li;
40d224a9 6640
c7df85a4 6641 listRewind(server.slaves,&li);
6642 while((ln = listNext(&li))) {
40d224a9 6643 slave = ln->value;
6644 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 6645 }
6646 if (ln) {
6647 /* Perfect, the server is already registering differences for
6648 * another slave. Set the right state, and copy the buffer. */
6649 listRelease(c->reply);
6650 c->reply = listDup(slave->reply);
40d224a9 6651 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6652 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6653 } else {
6654 /* No way, we need to wait for the next BGSAVE in order to
6655 * register differences */
6656 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6657 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6658 }
6659 } else {
6660 /* Ok we don't have a BGSAVE in progress, let's start one */
6661 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6662 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6663 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6664 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6665 return;
6666 }
6667 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6668 }
6208b3a7 6669 c->repldbfd = -1;
40d224a9 6670 c->flags |= REDIS_SLAVE;
6671 c->slaveseldb = 0;
6b47e12e 6672 listAddNodeTail(server.slaves,c);
40d224a9 6673 return;
6674}
6675
6208b3a7 6676static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6677 redisClient *slave = privdata;
6678 REDIS_NOTUSED(el);
6679 REDIS_NOTUSED(mask);
6680 char buf[REDIS_IOBUF_LEN];
6681 ssize_t nwritten, buflen;
6682
6683 if (slave->repldboff == 0) {
6684 /* Write the bulk write count before to transfer the DB. In theory here
6685 * we don't know how much room there is in the output buffer of the
6686 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6687 * operations) will never be smaller than the few bytes we need. */
6688 sds bulkcount;
6689
6690 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6691 slave->repldbsize);
6692 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6693 {
6694 sdsfree(bulkcount);
6695 freeClient(slave);
6696 return;
6697 }
6698 sdsfree(bulkcount);
6699 }
6700 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6701 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6702 if (buflen <= 0) {
6703 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6704 (buflen == 0) ? "premature EOF" : strerror(errno));
6705 freeClient(slave);
6706 return;
6707 }
6708 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 6709 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 6710 strerror(errno));
6711 freeClient(slave);
6712 return;
6713 }
6714 slave->repldboff += nwritten;
6715 if (slave->repldboff == slave->repldbsize) {
6716 close(slave->repldbfd);
6717 slave->repldbfd = -1;
6718 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6719 slave->replstate = REDIS_REPL_ONLINE;
6720 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 6721 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 6722 freeClient(slave);
6723 return;
6724 }
6725 addReplySds(slave,sdsempty());
6726 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6727 }
6728}
ed9b544e 6729
a3b21203 6730/* This function is called at the end of every backgrond saving.
6731 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6732 * otherwise REDIS_ERR is passed to the function.
6733 *
6734 * The goal of this function is to handle slaves waiting for a successful
6735 * background saving in order to perform non-blocking synchronization. */
6736static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 6737 listNode *ln;
6738 int startbgsave = 0;
c7df85a4 6739 listIter li;
ed9b544e 6740
c7df85a4 6741 listRewind(server.slaves,&li);
6742 while((ln = listNext(&li))) {
6208b3a7 6743 redisClient *slave = ln->value;
ed9b544e 6744
6208b3a7 6745 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6746 startbgsave = 1;
6747 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6748 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 6749 struct redis_stat buf;
6208b3a7 6750
6751 if (bgsaveerr != REDIS_OK) {
6752 freeClient(slave);
6753 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6754 continue;
6755 }
6756 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 6757 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 6758 freeClient(slave);
6759 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6760 continue;
6761 }
6762 slave->repldboff = 0;
6763 slave->repldbsize = buf.st_size;
6764 slave->replstate = REDIS_REPL_SEND_BULK;
6765 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 6766 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 6767 freeClient(slave);
6768 continue;
6769 }
6770 }
ed9b544e 6771 }
6208b3a7 6772 if (startbgsave) {
6773 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 6774 listIter li;
6775
6776 listRewind(server.slaves,&li);
6208b3a7 6777 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 6778 while((ln = listNext(&li))) {
6208b3a7 6779 redisClient *slave = ln->value;
ed9b544e 6780
6208b3a7 6781 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6782 freeClient(slave);
6783 }
6784 }
6785 }
ed9b544e 6786}
6787
6788static int syncWithMaster(void) {
d0ccebcf 6789 char buf[1024], tmpfile[256], authcmd[1024];
ed9b544e 6790 int dumpsize;
6791 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6792 int dfd;
6793
6794 if (fd == -1) {
6795 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6796 strerror(errno));
6797 return REDIS_ERR;
6798 }
d0ccebcf 6799
6800 /* AUTH with the master if required. */
6801 if(server.masterauth) {
6802 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6803 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6804 close(fd);
6805 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6806 strerror(errno));
6807 return REDIS_ERR;
6808 }
6809 /* Read the AUTH result. */
6810 if (syncReadLine(fd,buf,1024,3600) == -1) {
6811 close(fd);
6812 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6813 strerror(errno));
6814 return REDIS_ERR;
6815 }
6816 if (buf[0] != '+') {
6817 close(fd);
6818 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6819 return REDIS_ERR;
6820 }
6821 }
6822
ed9b544e 6823 /* Issue the SYNC command */
6824 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6825 close(fd);
6826 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6827 strerror(errno));
6828 return REDIS_ERR;
6829 }
6830 /* Read the bulk write count */
8c4d91fc 6831 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 6832 close(fd);
6833 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6834 strerror(errno));
6835 return REDIS_ERR;
6836 }
4aa701c1 6837 if (buf[0] != '$') {
6838 close(fd);
6839 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6840 return REDIS_ERR;
6841 }
c937aa89 6842 dumpsize = atoi(buf+1);
ed9b544e 6843 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6844 /* Read the bulk write data on a temp file */
6845 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6846 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6847 if (dfd == -1) {
6848 close(fd);
6849 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6850 return REDIS_ERR;
6851 }
6852 while(dumpsize) {
6853 int nread, nwritten;
6854
6855 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6856 if (nread == -1) {
6857 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6858 strerror(errno));
6859 close(fd);
6860 close(dfd);
6861 return REDIS_ERR;
6862 }
6863 nwritten = write(dfd,buf,nread);
6864 if (nwritten == -1) {
6865 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6866 close(fd);
6867 close(dfd);
6868 return REDIS_ERR;
6869 }
6870 dumpsize -= nread;
6871 }
6872 close(dfd);
6873 if (rename(tmpfile,server.dbfilename) == -1) {
6874 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6875 unlink(tmpfile);
6876 close(fd);
6877 return REDIS_ERR;
6878 }
6879 emptyDb();
f78fd11b 6880 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 6881 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6882 close(fd);
6883 return REDIS_ERR;
6884 }
6885 server.master = createClient(fd);
6886 server.master->flags |= REDIS_MASTER;
179b3952 6887 server.master->authenticated = 1;
ed9b544e 6888 server.replstate = REDIS_REPL_CONNECTED;
6889 return REDIS_OK;
6890}
6891
321b0e13 6892static void slaveofCommand(redisClient *c) {
6893 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6894 !strcasecmp(c->argv[2]->ptr,"one")) {
6895 if (server.masterhost) {
6896 sdsfree(server.masterhost);
6897 server.masterhost = NULL;
6898 if (server.master) freeClient(server.master);
6899 server.replstate = REDIS_REPL_NONE;
6900 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6901 }
6902 } else {
6903 sdsfree(server.masterhost);
6904 server.masterhost = sdsdup(c->argv[1]->ptr);
6905 server.masterport = atoi(c->argv[2]->ptr);
6906 if (server.master) freeClient(server.master);
6907 server.replstate = REDIS_REPL_CONNECT;
6908 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6909 server.masterhost, server.masterport);
6910 }
6911 addReply(c,shared.ok);
6912}
6913
3fd78bcd 6914/* ============================ Maxmemory directive ======================== */
6915
a5819310 6916/* Try to free one object form the pre-allocated objects free list.
6917 * This is useful under low mem conditions as by default we take 1 million
6918 * free objects allocated. On success REDIS_OK is returned, otherwise
6919 * REDIS_ERR. */
6920static int tryFreeOneObjectFromFreelist(void) {
f870935d 6921 robj *o;
6922
a5819310 6923 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6924 if (listLength(server.objfreelist)) {
6925 listNode *head = listFirst(server.objfreelist);
6926 o = listNodeValue(head);
6927 listDelNode(server.objfreelist,head);
6928 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6929 zfree(o);
6930 return REDIS_OK;
6931 } else {
6932 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6933 return REDIS_ERR;
6934 }
f870935d 6935}
6936
3fd78bcd 6937/* This function gets called when 'maxmemory' is set on the config file to limit
6938 * the max memory used by the server, and we are out of memory.
6939 * This function will try to, in order:
6940 *
6941 * - Free objects from the free list
6942 * - Try to remove keys with an EXPIRE set
6943 *
6944 * It is not possible to free enough memory to reach used-memory < maxmemory
6945 * the server will start refusing commands that will enlarge even more the
6946 * memory usage.
6947 */
6948static void freeMemoryIfNeeded(void) {
6949 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 6950 int j, k, freed = 0;
6951
6952 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6953 for (j = 0; j < server.dbnum; j++) {
6954 int minttl = -1;
6955 robj *minkey = NULL;
6956 struct dictEntry *de;
6957
6958 if (dictSize(server.db[j].expires)) {
6959 freed = 1;
6960 /* From a sample of three keys drop the one nearest to
6961 * the natural expire */
6962 for (k = 0; k < 3; k++) {
6963 time_t t;
6964
6965 de = dictGetRandomKey(server.db[j].expires);
6966 t = (time_t) dictGetEntryVal(de);
6967 if (minttl == -1 || t < minttl) {
6968 minkey = dictGetEntryKey(de);
6969 minttl = t;
3fd78bcd 6970 }
3fd78bcd 6971 }
a5819310 6972 deleteKey(server.db+j,minkey);
3fd78bcd 6973 }
3fd78bcd 6974 }
a5819310 6975 if (!freed) return; /* nothing to free... */
3fd78bcd 6976 }
6977}
6978
f80dff62 6979/* ============================== Append Only file ========================== */
6980
6981static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6982 sds buf = sdsempty();
6983 int j;
6984 ssize_t nwritten;
6985 time_t now;
6986 robj *tmpargv[3];
6987
6988 /* The DB this command was targetting is not the same as the last command
6989 * we appendend. To issue a SELECT command is needed. */
6990 if (dictid != server.appendseldb) {
6991 char seldb[64];
6992
6993 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 6994 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 6995 (unsigned long)strlen(seldb),seldb);
f80dff62 6996 server.appendseldb = dictid;
6997 }
6998
6999 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7000 * EXPIREs into EXPIREATs calls */
7001 if (cmd->proc == expireCommand) {
7002 long when;
7003
7004 tmpargv[0] = createStringObject("EXPIREAT",8);
7005 tmpargv[1] = argv[1];
7006 incrRefCount(argv[1]);
7007 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7008 tmpargv[2] = createObject(REDIS_STRING,
7009 sdscatprintf(sdsempty(),"%ld",when));
7010 argv = tmpargv;
7011 }
7012
7013 /* Append the actual command */
7014 buf = sdscatprintf(buf,"*%d\r\n",argc);
7015 for (j = 0; j < argc; j++) {
7016 robj *o = argv[j];
7017
9d65a1bb 7018 o = getDecodedObject(o);
83c6a618 7019 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7020 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7021 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7022 decrRefCount(o);
f80dff62 7023 }
7024
7025 /* Free the objects from the modified argv for EXPIREAT */
7026 if (cmd->proc == expireCommand) {
7027 for (j = 0; j < 3; j++)
7028 decrRefCount(argv[j]);
7029 }
7030
7031 /* We want to perform a single write. This should be guaranteed atomic
7032 * at least if the filesystem we are writing is a real physical one.
7033 * While this will save us against the server being killed I don't think
7034 * there is much to do about the whole server stopping for power problems
7035 * or alike */
7036 nwritten = write(server.appendfd,buf,sdslen(buf));
7037 if (nwritten != (signed)sdslen(buf)) {
7038 /* Ooops, we are in troubles. The best thing to do for now is
7039 * to simply exit instead to give the illusion that everything is
7040 * working as expected. */
7041 if (nwritten == -1) {
7042 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7043 } else {
7044 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7045 }
7046 exit(1);
7047 }
85a83172 7048 /* If a background append only file rewriting is in progress we want to
7049 * accumulate the differences between the child DB and the current one
7050 * in a buffer, so that when the child process will do its work we
7051 * can append the differences to the new append only file. */
7052 if (server.bgrewritechildpid != -1)
7053 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7054
7055 sdsfree(buf);
f80dff62 7056 now = time(NULL);
7057 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7058 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7059 now-server.lastfsync > 1))
7060 {
7061 fsync(server.appendfd); /* Let's try to get this data on the disk */
7062 server.lastfsync = now;
7063 }
7064}
7065
7066/* In Redis commands are always executed in the context of a client, so in
7067 * order to load the append only file we need to create a fake client. */
7068static struct redisClient *createFakeClient(void) {
7069 struct redisClient *c = zmalloc(sizeof(*c));
7070
7071 selectDb(c,0);
7072 c->fd = -1;
7073 c->querybuf = sdsempty();
7074 c->argc = 0;
7075 c->argv = NULL;
7076 c->flags = 0;
9387d17d 7077 /* We set the fake client as a slave waiting for the synchronization
7078 * so that Redis will not try to send replies to this client. */
7079 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7080 c->reply = listCreate();
7081 listSetFreeMethod(c->reply,decrRefCount);
7082 listSetDupMethod(c->reply,dupClientReplyValue);
7083 return c;
7084}
7085
7086static void freeFakeClient(struct redisClient *c) {
7087 sdsfree(c->querybuf);
7088 listRelease(c->reply);
7089 zfree(c);
7090}
7091
7092/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7093 * error (the append only file is zero-length) REDIS_ERR is returned. On
7094 * fatal error an error message is logged and the program exists. */
7095int loadAppendOnlyFile(char *filename) {
7096 struct redisClient *fakeClient;
7097 FILE *fp = fopen(filename,"r");
7098 struct redis_stat sb;
b492cf00 7099 unsigned long long loadedkeys = 0;
f80dff62 7100
7101 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7102 return REDIS_ERR;
7103
7104 if (fp == NULL) {
7105 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7106 exit(1);
7107 }
7108
7109 fakeClient = createFakeClient();
7110 while(1) {
7111 int argc, j;
7112 unsigned long len;
7113 robj **argv;
7114 char buf[128];
7115 sds argsds;
7116 struct redisCommand *cmd;
7117
7118 if (fgets(buf,sizeof(buf),fp) == NULL) {
7119 if (feof(fp))
7120 break;
7121 else
7122 goto readerr;
7123 }
7124 if (buf[0] != '*') goto fmterr;
7125 argc = atoi(buf+1);
7126 argv = zmalloc(sizeof(robj*)*argc);
7127 for (j = 0; j < argc; j++) {
7128 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7129 if (buf[0] != '$') goto fmterr;
7130 len = strtol(buf+1,NULL,10);
7131 argsds = sdsnewlen(NULL,len);
0f151ef1 7132 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7133 argv[j] = createObject(REDIS_STRING,argsds);
7134 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7135 }
7136
7137 /* Command lookup */
7138 cmd = lookupCommand(argv[0]->ptr);
7139 if (!cmd) {
7140 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7141 exit(1);
7142 }
7143 /* Try object sharing and encoding */
7144 if (server.shareobjects) {
7145 int j;
7146 for(j = 1; j < argc; j++)
7147 argv[j] = tryObjectSharing(argv[j]);
7148 }
7149 if (cmd->flags & REDIS_CMD_BULK)
7150 tryObjectEncoding(argv[argc-1]);
7151 /* Run the command in the context of a fake client */
7152 fakeClient->argc = argc;
7153 fakeClient->argv = argv;
7154 cmd->proc(fakeClient);
7155 /* Discard the reply objects list from the fake client */
7156 while(listLength(fakeClient->reply))
7157 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7158 /* Clean up, ready for the next command */
7159 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7160 zfree(argv);
b492cf00 7161 /* Handle swapping while loading big datasets when VM is on */
7162 loadedkeys++;
7163 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7164 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7165 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7166 }
7167 }
f80dff62 7168 }
7169 fclose(fp);
7170 freeFakeClient(fakeClient);
7171 return REDIS_OK;
7172
7173readerr:
7174 if (feof(fp)) {
7175 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7176 } else {
7177 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7178 }
7179 exit(1);
7180fmterr:
7181 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7182 exit(1);
7183}
7184
9d65a1bb 7185/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7186static int fwriteBulk(FILE *fp, robj *obj) {
7187 char buf[128];
b9bc0eef 7188 int decrrc = 0;
7189
f2d9f50f 7190 /* Avoid the incr/decr ref count business if possible to help
7191 * copy-on-write (we are often in a child process when this function
7192 * is called).
7193 * Also makes sure that key objects don't get incrRefCount-ed when VM
7194 * is enabled */
7195 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 7196 obj = getDecodedObject(obj);
7197 decrrc = 1;
7198 }
9d65a1bb 7199 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7200 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 7201 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7202 goto err;
9d65a1bb 7203 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 7204 if (decrrc) decrRefCount(obj);
9d65a1bb 7205 return 1;
7206err:
b9bc0eef 7207 if (decrrc) decrRefCount(obj);
9d65a1bb 7208 return 0;
7209}
7210
/* Write the double 'd' into 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n
 * where the payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payload_len;

    /* The payload buffer includes the trailing CRLF, which must not be
     * counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,payload_len,1,fp) != 0;
}
7221
/* Write the long 'l' into 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n
 * where the payload is the decimal representation of the value.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t payload_len;

    /* The payload buffer includes the trailing CRLF, which must not be
     * counted in the length announced by the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,payload_len,1,fp) != 0;
}
7232
7233/* Write a sequence of commands able to fully rebuild the dataset into
7234 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7235static int rewriteAppendOnlyFile(char *filename) {
7236 dictIterator *di = NULL;
7237 dictEntry *de;
7238 FILE *fp;
7239 char tmpfile[256];
7240 int j;
7241 time_t now = time(NULL);
7242
7243 /* Note that we have to use a different temp name here compared to the
7244 * one used by rewriteAppendOnlyFileBackground() function. */
7245 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7246 fp = fopen(tmpfile,"w");
7247 if (!fp) {
7248 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7249 return REDIS_ERR;
7250 }
7251 for (j = 0; j < server.dbnum; j++) {
7252 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7253 redisDb *db = server.db+j;
7254 dict *d = db->dict;
7255 if (dictSize(d) == 0) continue;
7256 di = dictGetIterator(d);
7257 if (!di) {
7258 fclose(fp);
7259 return REDIS_ERR;
7260 }
7261
7262 /* SELECT the new DB */
7263 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 7264 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 7265
7266 /* Iterate this DB writing every entry */
7267 while((de = dictNext(di)) != NULL) {
e7546c63 7268 robj *key, *o;
7269 time_t expiretime;
7270 int swapped;
7271
7272 key = dictGetEntryKey(de);
b9bc0eef 7273 /* If the value for this key is swapped, load a preview in memory.
7274 * We use a "swapped" flag to remember if we need to free the
7275 * value object instead to just increment the ref count anyway
7276 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 7277 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7278 key->storage == REDIS_VM_SWAPPING) {
e7546c63 7279 o = dictGetEntryVal(de);
7280 swapped = 0;
7281 } else {
7282 o = vmPreviewObject(key);
e7546c63 7283 swapped = 1;
7284 }
7285 expiretime = getExpire(db,key);
9d65a1bb 7286
7287 /* Save the key and associated value */
9d65a1bb 7288 if (o->type == REDIS_STRING) {
7289 /* Emit a SET command */
7290 char cmd[]="*3\r\n$3\r\nSET\r\n";
7291 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7292 /* Key and value */
7293 if (fwriteBulk(fp,key) == 0) goto werr;
7294 if (fwriteBulk(fp,o) == 0) goto werr;
7295 } else if (o->type == REDIS_LIST) {
7296 /* Emit the RPUSHes needed to rebuild the list */
7297 list *list = o->ptr;
7298 listNode *ln;
c7df85a4 7299 listIter li;
9d65a1bb 7300
c7df85a4 7301 listRewind(list,&li);
7302 while((ln = listNext(&li))) {
9d65a1bb 7303 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7304 robj *eleobj = listNodeValue(ln);
7305
7306 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7307 if (fwriteBulk(fp,key) == 0) goto werr;
7308 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7309 }
7310 } else if (o->type == REDIS_SET) {
7311 /* Emit the SADDs needed to rebuild the set */
7312 dict *set = o->ptr;
7313 dictIterator *di = dictGetIterator(set);
7314 dictEntry *de;
7315
7316 while((de = dictNext(di)) != NULL) {
7317 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7318 robj *eleobj = dictGetEntryKey(de);
7319
7320 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7321 if (fwriteBulk(fp,key) == 0) goto werr;
7322 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7323 }
7324 dictReleaseIterator(di);
7325 } else if (o->type == REDIS_ZSET) {
7326 /* Emit the ZADDs needed to rebuild the sorted set */
7327 zset *zs = o->ptr;
7328 dictIterator *di = dictGetIterator(zs->dict);
7329 dictEntry *de;
7330
7331 while((de = dictNext(di)) != NULL) {
7332 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7333 robj *eleobj = dictGetEntryKey(de);
7334 double *score = dictGetEntryVal(de);
7335
7336 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7337 if (fwriteBulk(fp,key) == 0) goto werr;
7338 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7339 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7340 }
7341 dictReleaseIterator(di);
7342 } else {
dfc5e96c 7343 redisAssert(0 != 0);
9d65a1bb 7344 }
7345 /* Save the expire time */
7346 if (expiretime != -1) {
e96e4fbf 7347 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 7348 /* If this key is already expired skip it */
7349 if (expiretime < now) continue;
7350 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7351 if (fwriteBulk(fp,key) == 0) goto werr;
7352 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7353 }
b9bc0eef 7354 if (swapped) decrRefCount(o);
9d65a1bb 7355 }
7356 dictReleaseIterator(di);
7357 }
7358
7359 /* Make sure data will not remain on the OS's output buffers */
7360 fflush(fp);
7361 fsync(fileno(fp));
7362 fclose(fp);
7363
7364 /* Use RENAME to make sure the DB file is changed atomically only
7365 * if the generate DB file is ok. */
7366 if (rename(tmpfile,filename) == -1) {
7367 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7368 unlink(tmpfile);
7369 return REDIS_ERR;
7370 }
7371 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7372 return REDIS_OK;
7373
7374werr:
7375 fclose(fp);
7376 unlink(tmpfile);
e96e4fbf 7377 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 7378 if (di) dictReleaseIterator(di);
7379 return REDIS_ERR;
7380}
7381
7382/* This is how rewriting of the append only file in background works:
7383 *
7384 * 1) The user calls BGREWRITEAOF
7385 * 2) Redis calls this function, that forks():
7386 * 2a) the child rewrite the append only file in a temp file.
7387 * 2b) the parent accumulates differences in server.bgrewritebuf.
7388 * 3) When the child finished '2a' exists.
7389 * 4) The parent will trap the exit code, if it's OK, will append the
7390 * data accumulated into server.bgrewritebuf into the temp file, and
7391 * finally will rename(2) the temp file in the actual file name.
7392 * The the new file is reopened as the new append only file. Profit!
7393 */
7394static int rewriteAppendOnlyFileBackground(void) {
7395 pid_t childpid;
7396
7397 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 7398 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 7399 if ((childpid = fork()) == 0) {
7400 /* Child */
7401 char tmpfile[256];
9d65a1bb 7402
054e426d 7403 if (server.vm_enabled) vmReopenSwapFile();
7404 close(server.fd);
9d65a1bb 7405 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7406 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 7407 _exit(0);
9d65a1bb 7408 } else {
478c2c6f 7409 _exit(1);
9d65a1bb 7410 }
7411 } else {
7412 /* Parent */
7413 if (childpid == -1) {
7414 redisLog(REDIS_WARNING,
7415 "Can't rewrite append only file in background: fork: %s",
7416 strerror(errno));
7417 return REDIS_ERR;
7418 }
7419 redisLog(REDIS_NOTICE,
7420 "Background append only file rewriting started by pid %d",childpid);
7421 server.bgrewritechildpid = childpid;
85a83172 7422 /* We set appendseldb to -1 in order to force the next call to the
7423 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7424 * accumulated by the parent into server.bgrewritebuf will start
7425 * with a SELECT statement and it will be safe to merge. */
7426 server.appendseldb = -1;
9d65a1bb 7427 return REDIS_OK;
7428 }
7429 return REDIS_OK; /* unreached */
7430}
7431
7432static void bgrewriteaofCommand(redisClient *c) {
7433 if (server.bgrewritechildpid != -1) {
7434 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7435 return;
7436 }
7437 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 7438 char *status = "+Background append only file rewriting started\r\n";
7439 addReplySds(c,sdsnew(status));
9d65a1bb 7440 } else {
7441 addReply(c,shared.err);
7442 }
7443}
7444
/* Remove the temp file written by the background AOF rewrite child whose
 * pid is 'childpid'. The name is built with the same pattern used by
 * rewriteAppendOnlyFileBackground(). The unlink(2) result is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char tmpfile[256];

    /* Size the format from the buffer itself instead of repeating 256. */
    snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(tmpfile);
}
7451
996cb5f7 7452/* Virtual Memory is composed mainly of two subsystems:
7453 * - Blocking Virtual Memory
7454 * - Threaded Virtual Memory I/O
7455 * The two parts are not fully decoupled, but functions are split among two
7456 * different sections of the source code (delimited by comments) in order to
7457 * make more clear what functionality is about the blocking VM and what about
7458 * the threaded (not blocking) VM.
7459 *
7460 * Redis VM design:
7461 *
7462 * Redis VM is a blocking VM (one that blocks reading swapped values from
7463 * disk into memory when a value swapped out is needed in memory) that is made
7464 * unblocking by trying to examine the command argument vector in order to
7465 * load in background values that will likely be needed in order to exec
7466 * the command. The command is executed only once all the relevant keys
7467 * are loaded into memory.
7468 *
7469 * This is basically almost as simple as a blocking VM, but almost as parallel
7470 * as a fully non-blocking VM.
7471 */
7472
7473/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 7474
7475/* substitute the first occurrence of '%p' with the process pid in the
7476 * swap file name. */
7477static void expandVmSwapFilename(void) {
7478 char *p = strstr(server.vm_swap_file,"%p");
7479 sds new;
7480
7481 if (!p) return;
7482 new = sdsempty();
7483 *p = '\0';
7484 new = sdscat(new,server.vm_swap_file);
7485 new = sdscatprintf(new,"%ld",(long) getpid());
7486 new = sdscat(new,p+2);
7487 zfree(server.vm_swap_file);
7488 server.vm_swap_file = new;
7489}
7490
75680a3c 7491static void vmInit(void) {
7492 off_t totsize;
996cb5f7 7493 int pipefds[2];
bcaa7a4f 7494 size_t stacksize;
75680a3c 7495
4ad37480 7496 if (server.vm_max_threads != 0)
7497 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7498
054e426d 7499 expandVmSwapFilename();
7500 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 7501 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7502 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7503 }
75680a3c 7504 if (server.vm_fp == NULL) {
6fa987e3 7505 redisLog(REDIS_WARNING,
7506 "Impossible to open the swap file: %s. Exiting.",
7507 strerror(errno));
75680a3c 7508 exit(1);
7509 }
7510 server.vm_fd = fileno(server.vm_fp);
7511 server.vm_next_page = 0;
7512 server.vm_near_pages = 0;
7d98e08c 7513 server.vm_stats_used_pages = 0;
7514 server.vm_stats_swapped_objects = 0;
7515 server.vm_stats_swapouts = 0;
7516 server.vm_stats_swapins = 0;
75680a3c 7517 totsize = server.vm_pages*server.vm_page_size;
7518 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7519 if (ftruncate(server.vm_fd,totsize) == -1) {
7520 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7521 strerror(errno));
7522 exit(1);
7523 } else {
7524 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7525 }
7d30035d 7526 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 7527 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 7528 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 7529 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 7530
996cb5f7 7531 /* Initialize threaded I/O (used by Virtual Memory) */
7532 server.io_newjobs = listCreate();
7533 server.io_processing = listCreate();
7534 server.io_processed = listCreate();
d5d55fc3 7535 server.io_ready_clients = listCreate();
92f8e882 7536 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 7537 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7538 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 7539 server.io_active_threads = 0;
996cb5f7 7540 if (pipe(pipefds) == -1) {
7541 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7542 ,strerror(errno));
7543 exit(1);
7544 }
7545 server.io_ready_pipe_read = pipefds[0];
7546 server.io_ready_pipe_write = pipefds[1];
7547 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 7548 /* LZF requires a lot of stack */
7549 pthread_attr_init(&server.io_threads_attr);
7550 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7551 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7552 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 7553 /* Listen for events in the threaded I/O pipe */
7554 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7555 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7556 oom("creating file event");
75680a3c 7557}
7558
06224fec 7559/* Mark the page as used */
7560static void vmMarkPageUsed(off_t page) {
7561 off_t byte = page/8;
7562 int bit = page&7;
970e10bb 7563 redisAssert(vmFreePage(page) == 1);
06224fec 7564 server.vm_bitmap[byte] |= 1<<bit;
7565}
7566
7567/* Mark N contiguous pages as used, with 'page' being the first. */
7568static void vmMarkPagesUsed(off_t page, off_t count) {
7569 off_t j;
7570
7571 for (j = 0; j < count; j++)
7d30035d 7572 vmMarkPageUsed(page+j);
7d98e08c 7573 server.vm_stats_used_pages += count;
7c775e09 7574 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7575 (long long)count, (long long)page);
06224fec 7576}
7577
7578/* Mark the page as free */
7579static void vmMarkPageFree(off_t page) {
7580 off_t byte = page/8;
7581 int bit = page&7;
970e10bb 7582 redisAssert(vmFreePage(page) == 0);
06224fec 7583 server.vm_bitmap[byte] &= ~(1<<bit);
7584}
7585
7586/* Mark N contiguous pages as free, with 'page' being the first. */
7587static void vmMarkPagesFree(off_t page, off_t count) {
7588 off_t j;
7589
7590 for (j = 0; j < count; j++)
7d30035d 7591 vmMarkPageFree(page+j);
7d98e08c 7592 server.vm_stats_used_pages -= count;
7c775e09 7593 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7594 (long long)count, (long long)page);
06224fec 7595}
7596
7597/* Test if the page is free */
7598static int vmFreePage(off_t page) {
7599 off_t byte = page/8;
7600 int bit = page&7;
7d30035d 7601 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 7602}
7603
7604/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 7605 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7606 * REDIS_ERR is returned.
06224fec 7607 *
7608 * This function uses a simple algorithm: we try to allocate
7609 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7610 * again from the start of the swap file searching for free spaces.
7611 *
7612 * If it looks pretty clear that there are no free pages near our offset
7613 * we try to find less populated places doing a forward jump of
7614 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7615 * without hurry, and then we jump again and so forth...
7616 *
7617 * This function can be improved using a free list to avoid to guess
7618 * too much, since we could collect data about freed pages.
7619 *
7620 * note: I implemented this function just after watching an episode of
7621 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
 *
 * Side effects: server.vm_near_pages is incremented at every call (and
 * reset to zero together with server.vm_next_page when it reaches
 * REDIS_VM_MAX_NEAR_PAGES). On success server.vm_next_page is set to the
 * page following the cluster found. The pages are only located here, this
 * function does not mark them as used in the bitmap.
7622 */
c7df85a4 7623static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 7624 off_t base, offset = 0, since_jump = 0, numfree = 0;
7625
7626 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7627 server.vm_near_pages = 0;
7628 server.vm_next_page = 0;
7629 }
7630 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7631 base = server.vm_next_page;
7632
 /* 'offset' is the distance from 'base', advanced by one page or by a
  * random jump at every iteration: the search gives up once the
  * distance reaches the total number of pages. */
7633 while(offset < server.vm_pages) {
7634 off_t this = base+offset;
7635
7636 /* If we overflow, restart from page zero */
7637 if (this >= server.vm_pages) {
7638 this -= server.vm_pages;
7639 if (this == 0) {
7640 /* Just overflowed, what we found on tail is no longer
7641 * interesting, as it's no longer contiguous. */
7642 numfree = 0;
7643 }
7644 }
7645 if (vmFreePage(this)) {
7646 /* This is a free page */
7647 numfree++;
7648 /* Already got N free pages? Return to the caller, with success */
7649 if (numfree == n) {
 /* 'this' is the last page of the run, step back to its start. */
7d30035d 7650 *first = this-(n-1);
7651 server.vm_next_page = this+1;
7c775e09 7652 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 7653 return REDIS_OK;
06224fec 7654 }
7655 } else {
7656 /* The current one is not a free page */
7657 numfree = 0;
7658 }
7659
7660 /* Fast-forward if the current page is not free and we already
7661 * searched enough near this place. */
7662 since_jump++;
7663 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7664 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7665 since_jump = 0;
7666 /* Note that even if we rewind after the jump, we don't need
7667 * to make sure numfree is set to zero as we only jump *if* it
7668 * is set to zero. */
7669 } else {
7670 /* Otherwise just check the next page */
7671 offset++;
7672 }
7673 }
3a66edc7 7674 return REDIS_ERR;
7675}
7676
a5819310 7677/* Write the specified object at the specified page of the swap file */
7678static int vmWriteObjectOnSwap(robj *o, off_t page) {
7679 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7680 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7681 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7682 redisLog(REDIS_WARNING,
9ebed7cf 7683 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 7684 strerror(errno));
7685 return REDIS_ERR;
7686 }
7687 rdbSaveObject(server.vm_fp,o);
ba76a8f9 7688 fflush(server.vm_fp);
a5819310 7689 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7690 return REDIS_OK;
7691}
7692
3a66edc7 7693/* Swap the 'val' object relative to 'key' into disk. Store all the information
7694 * needed to later retrieve the object into the key object.
7695 * If we can't find enough contiguous empty pages to swap the object on disk
7696 * REDIS_ERR is returned. */
a69a0c9c 7697static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 7698 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 7699 off_t page;
7700
7701 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 7702 assert(key->refcount == 1);
3a66edc7 7703 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 7704 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 7705 key->vm.page = page;
7706 key->vm.usedpages = pages;
7707 key->storage = REDIS_VM_SWAPPED;
d894161b 7708 key->vtype = val->type;
3a66edc7 7709 decrRefCount(val); /* Deallocate the object from memory. */
7710 vmMarkPagesUsed(page,pages);
7d30035d 7711 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7712 (unsigned char*) key->ptr,
7713 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 7714 server.vm_stats_swapped_objects++;
7715 server.vm_stats_swapouts++;
3a66edc7 7716 return REDIS_OK;
7717}
7718
a5819310 7719static robj *vmReadObjectFromSwap(off_t page, int type) {
7720 robj *o;
3a66edc7 7721
a5819310 7722 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7723 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 7724 redisLog(REDIS_WARNING,
d5d55fc3 7725 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 7726 strerror(errno));
478c2c6f 7727 _exit(1);
3a66edc7 7728 }
a5819310 7729 o = rdbLoadObject(type,server.vm_fp);
7730 if (o == NULL) {
d5d55fc3 7731 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 7732 _exit(1);
3a66edc7 7733 }
a5819310 7734 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7735 return o;
7736}
7737
7738/* Load the value object relative to the 'key' object from swap to memory.
7739 * The newly allocated object is returned.
7740 *
7741 * If preview is true the unserialized object is returned to the caller but
7742 * no changes are made to the key object, nor the pages are marked as freed */
7743static robj *vmGenericLoadObject(robj *key, int preview) {
7744 robj *val;
7745
d5d55fc3 7746 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 7747 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 7748 if (!preview) {
7749 key->storage = REDIS_VM_MEMORY;
7750 key->vm.atime = server.unixtime;
7751 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7752 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7753 (unsigned char*) key->ptr);
7d98e08c 7754 server.vm_stats_swapped_objects--;
38aba9a1 7755 } else {
7756 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7757 (unsigned char*) key->ptr);
7e69548d 7758 }
7d98e08c 7759 server.vm_stats_swapins++;
3a66edc7 7760 return val;
06224fec 7761}
7762
7e69548d 7763/* Plain object loading, from swap to memory */
7764static robj *vmLoadObject(robj *key) {
996cb5f7 7765 /* If we are loading the object in background, stop it, we
7766 * need to load this object synchronously ASAP. */
7767 if (key->storage == REDIS_VM_LOADING)
7768 vmCancelThreadedIOJob(key);
7e69548d 7769 return vmGenericLoadObject(key,0);
7770}
7771
7772/* Just load the value on disk, without to modify the key.
7773 * This is useful when we want to perform some operation on the value
7774 * without to really bring it from swap to memory, like while saving the
7775 * dataset or rewriting the append only log. */
7776static robj *vmPreviewObject(robj *key) {
7777 return vmGenericLoadObject(key,1);
7778}
7779
4ef8de8a 7780/* How a good candidate is this object for swapping?
7781 * The better candidate it is, the greater the returned value.
7782 *
7783 * Currently we try to perform a fast estimation of the object size in
7784 * memory, and combine it with aging informations.
7785 *
7786 * Basically swappability = idle-time * log(estimated size)
7787 *
7788 * Bigger objects are preferred over smaller objects, but not
7789 * proportionally, this is why we use the logarithm. This algorithm is
7790 * just a first try and will probably be tuned later.
 *
 * The idle time is server.unixtime minus the object access time: zero is
 * returned when it is not positive. The size of aggregate types is
 * extrapolated from a single element (the first one for lists, a random
 * one for sets and sorted sets) multiplied by the number of elements.
 * Types not handled by the switch leave the estimation at zero, so the
 * returned value is zero as well, since log(1) is zero. */
7791static double computeObjectSwappability(robj *o) {
7792 time_t age = server.unixtime - o->vm.atime;
7793 long asize = 0;
7794 list *l;
7795 dict *d;
7796 struct dictEntry *de;
7797 int z;
7798
7799 if (age <= 0) return 0;
7800 switch(o->type) {
7801 case REDIS_STRING:
 /* Strings not using the raw encoding are accounted as the object
  * header only. */
7802 if (o->encoding != REDIS_ENCODING_RAW) {
7803 asize = sizeof(*o);
7804 } else {
7805 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7806 }
7807 break;
7808 case REDIS_LIST:
7809 l = o->ptr;
7810 listNode *ln = listFirst(l);
7811
7812 asize = sizeof(list);
 /* Use the first element as a sample of the size of every element. */
7813 if (ln) {
7814 robj *ele = ln->value;
7815 long elesize;
7816
7817 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7818 (sizeof(*o)+sdslen(ele->ptr)) :
7819 sizeof(*o);
7820 asize += (sizeof(listNode)+elesize)*listLength(l);
7821 }
7822 break;
7823 case REDIS_SET:
7824 case REDIS_ZSET:
 /* Sets and sorted sets share the estimation of the dictionary, 'z'
  * selects the additions that apply to sorted sets only. */
7825 z = (o->type == REDIS_ZSET);
7826 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7827
7828 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7829 if (z) asize += sizeof(zset)-sizeof(dict);
7830 if (dictSize(d)) {
7831 long elesize;
7832 robj *ele;
7833
 /* Use a random element as a sample of the size of every element. */
7834 de = dictGetRandomKey(d);
7835 ele = dictGetEntryKey(de);
7836 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7837 (sizeof(*o)+sdslen(ele->ptr)) :
7838 sizeof(*o);
7839 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7840 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7841 }
7842 break;
7843 }
 /* The +1 keeps the argument of log() positive when the size is zero. */
c8c72447 7844 return (double)age*log(1+asize);
4ef8de8a 7845}
7846
7847/* Try to swap an object that's a good candidate for swapping.
7848 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 7849 * to swap any object at all.
7850 *
7851 * If 'usethreads' is true, Redis will try to swap the object in background
7852 * using I/O threads.
 *
 * Candidates are sampled with dictGetRandomKey() from every non empty
 * database, and the one with the greatest computeObjectSwappability()
 * value wins. In the threaded case REDIS_OK is returned as soon as the
 * swap is handed to vmSwapObjectThreaded(), whose result is not checked
 * here. In the blocking case the value pointer of the dictionary entry is
 * set to NULL once the object is on disk. */
7853static int vmSwapOneObject(int usethreads) {
4ef8de8a 7854 int j, i;
7855 struct dictEntry *best = NULL;
7856 double best_swappability = 0;
b9bc0eef 7857 redisDb *best_db = NULL;
4ef8de8a 7858 robj *key, *val;
7859
7860 for (j = 0; j < server.dbnum; j++) {
7861 redisDb *db = server.db+j;
b72f6a4b 7862 /* Why maxtries is set to 100?
7863 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7864 * are swappable objects */
b0d8747d 7865 int maxtries = 100;
4ef8de8a 7866
7867 if (dictSize(db->dict) == 0) continue;
 /* Five valid candidates are wanted per database. Rejected picks are
  * not counted (see the i-- below) as long as 'maxtries' is not
  * exhausted, after that every pick counts so the loop terminates. */
7868 for (i = 0; i < 5; i++) {
7869 dictEntry *de;
7870 double swappability;
7871
e3cadb8a 7872 if (maxtries) maxtries--;
4ef8de8a 7873 de = dictGetRandomKey(db->dict);
7874 key = dictGetEntryKey(de);
7875 val = dictGetEntryVal(de);
1064ef87 7876 /* Only swap objects that are currently in memory.
7877 *
7878 * Also don't swap shared objects if threaded VM is on, as we
7879 * try to ensure that the main thread does not touch the
7880 * object while the I/O thread is using it, but we can't
7881 * control other keys without adding additional mutex. */
7882 if (key->storage != REDIS_VM_MEMORY ||
7883 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 7884 if (maxtries) i--; /* don't count this try */
7885 continue;
7886 }
4ef8de8a 7887 swappability = computeObjectSwappability(val);
7888 if (!best || swappability > best_swappability) {
7889 best = de;
7890 best_swappability = swappability;
b9bc0eef 7891 best_db = db;
4ef8de8a 7892 }
7893 }
7894 }
7c775e09 7895 if (best == NULL) return REDIS_ERR;
4ef8de8a 7896 key = dictGetEntryKey(best);
7897 val = dictGetEntryVal(best);
7898
e3cadb8a 7899 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 7900 key->ptr, best_swappability);
7901
7902 /* Unshare the key if needed */
 /* The swap information is stored into the key object itself, so the
  * dictionary entry gets a private copy when the key is referenced
  * from somewhere else too. */
7903 if (key->refcount > 1) {
7904 robj *newkey = dupStringObject(key);
7905 decrRefCount(key);
7906 key = dictGetEntryKey(best) = newkey;
7907 }
7908 /* Swap it */
a69a0c9c 7909 if (usethreads) {
b9bc0eef 7910 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 7911 return REDIS_OK;
7912 } else {
a69a0c9c 7913 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7914 dictGetEntryVal(best) = NULL;
7915 return REDIS_OK;
7916 } else {
7917 return REDIS_ERR;
7918 }
4ef8de8a 7919 }
7920}
7921
/* Swap one object synchronously: vmSwapOneObject() without I/O threads. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
7925
/* Swap one object in background: vmSwapOneObject() using I/O threads. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
7929
7e69548d 7930/* Return true if it's safe to swap out objects in a given moment.
7931 * Basically we don't want to swap objects out while there is a BGSAVE
7932 * or a BGAEOREWRITE running in backgroud. */
7933static int vmCanSwapOut(void) {
7934 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7935}
7936
1b03836c 7937/* Delete a key if swapped. Returns 1 if the key was found, was swapped
7938 * and was deleted. Otherwise 0 is returned. */
7939static int deleteIfSwapped(redisDb *db, robj *key) {
7940 dictEntry *de;
7941 robj *foundkey;
7942
7943 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7944 foundkey = dictGetEntryKey(de);
7945 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7946 deleteKey(db,key);
7947 return 1;
7948}
7949
996cb5f7 7950/* =================== Virtual Memory - Threaded I/O ======================= */
7951
b9bc0eef 7952static void freeIOJob(iojob *j) {
d5d55fc3 7953 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
7954 j->type == REDIS_IOJOB_DO_SWAP ||
7955 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 7956 decrRefCount(j->val);
7957 decrRefCount(j->key);
7958 zfree(j);
7959}
7960
996cb5f7 7961/* Every time a thread finished a Job, it writes a byte into the write side
7962 * of an unix pipe in order to "awake" the main thread, and this function
7963 * is called.
 *
 * One byte is read from 'fd' for every job taken from the head of
 * server.io_processed. The number of jobs handled in a single call is
 * limited to a percentage (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs
 * found in the list at the first iteration, with a minimum of one.
 * Canceled jobs are just freed and are not counted against this limit.
 *
 * Depending on the job type:
 * - LOAD: the value is attached to the dictionary entry, the key is flagged
 *   as in memory and the clients blocked on this key are handled.
 * - PREPARE_SWAP: pages are reserved and the job is queued again as
 *   DO_SWAP, or the operation is undone if swapping is not possible now.
 * - DO_SWAP: the key is flagged as swapped and the value in memory is
 *   released, then more swap jobs may be queued if memory is still over
 *   server.vm_max_memory.
 *
 * 'el', 'mask' and 'privdata' are not used. */
7964static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7965 int mask)
7966{
7967 char buf[1];
b0d8747d 7968 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 7969 REDIS_NOTUSED(el);
7970 REDIS_NOTUSED(mask);
7971 REDIS_NOTUSED(privdata);
7972
7973 /* For every byte we read in the read side of the pipe, there is one
7974 * I/O job completed to process. */
7975 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 7976 iojob *j;
7977 listNode *ln;
7978 robj *key;
7979 struct dictEntry *de;
7980
996cb5f7 7981 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 7982
7983 /* Get the processed element (the oldest one) */
7984 lockThreadedIO();
1064ef87 7985 assert(listLength(server.io_processed) != 0);
 /* The limit is computed only once per call, at the first iteration. */
f6c0bba8 7986 if (toprocess == -1) {
7987 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7988 if (toprocess <= 0) toprocess = 1;
7989 }
b9bc0eef 7990 ln = listFirst(server.io_processed);
7991 j = ln->value;
7992 listDelNode(server.io_processed,ln);
7993 unlockThreadedIO();
7994 /* If this job is marked as canceled, just ignore it */
7995 if (j->canceled) {
7996 freeIOJob(j);
7997 continue;
7998 }
7999 /* Post process it in the main thread, as there are things we
8000 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8001 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8002 de = dictFind(j->db->dict,j->key);
8003 assert(de != NULL);
 /* From now on 'key' is the key object stored in the dictionary. */
8004 key = dictGetEntryKey(de);
8005 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8006 redisDb *db;
8007
b9bc0eef 8008 /* Key loaded, bring it at home */
8009 key->storage = REDIS_VM_MEMORY;
8010 key->vm.atime = server.unixtime;
8011 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8012 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8013 (unsigned char*) key->ptr);
8014 server.vm_stats_swapped_objects--;
8015 server.vm_stats_swapins++;
d5d55fc3 8016 dictGetEntryVal(de) = j->val;
 /* The dictionary now references the value too: take a reference
  * before freeIOJob() drops the one held by the job. */
8017 incrRefCount(j->val);
 /* Save the db pointer, the job is freed before it is used. */
8018 db = j->db;
b9bc0eef 8019 freeIOJob(j);
d5d55fc3 8020 /* Handle clients waiting for this key to be loaded. */
8021 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8022 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8023 /* Now we know the amount of pages required to swap this object.
8024 * Let's find some space for it, and queue this task again
8025 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8026 if (!vmCanSwapOut() ||
8027 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8028 {
8029 /* Ooops... no space or we can't swap as there is
8030 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8031 freeIOJob(j);
054e426d 8032 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8033 } else {
c7df85a4 8034 /* Note that we need to mark this pages as used now,
8035 * if the job will be canceled, we'll mark them as freed
8036 * again. */
8037 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8038 j->type = REDIS_IOJOB_DO_SWAP;
8039 lockThreadedIO();
8040 queueIOJob(j);
8041 unlockThreadedIO();
8042 }
8043 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8044 robj *val;
8045
8046 /* Key swapped. We can finally free some memory. */
 /* Debugging aid: dump the state on stdout before the assertion
  * below fails. */
6c96ba7d 8047 if (key->storage != REDIS_VM_SWAPPING) {
8048 printf("key->storage: %d\n",key->storage);
8049 printf("key->name: %s\n",(char*)key->ptr);
8050 printf("key->refcount: %d\n",key->refcount);
8051 printf("val: %p\n",(void*)j->val);
8052 printf("val->type: %d\n",j->val->type);
8053 printf("val->ptr: %s\n",(char*)j->val->ptr);
8054 }
8055 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8056 val = dictGetEntryVal(de);
8057 key->vm.page = j->page;
8058 key->vm.usedpages = j->pages;
8059 key->storage = REDIS_VM_SWAPPED;
8060 key->vtype = j->val->type;
8061 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8062 dictGetEntryVal(de) = NULL;
b9bc0eef 8063 redisLog(REDIS_DEBUG,
8064 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8065 (unsigned char*) key->ptr,
8066 (unsigned long long) j->page, (unsigned long long) j->pages);
8067 server.vm_stats_swapped_objects++;
8068 server.vm_stats_swapouts++;
8069 freeIOJob(j);
f11b8647 8070 /* Put a few more swap requests in queue if we are still
8071 * out of memory */
b0d8747d 8072 if (trytoswap && vmCanSwapOut() &&
8073 zmalloc_used_memory() > server.vm_max_memory)
8074 {
f11b8647 8075 int more = 1;
8076 while(more) {
8077 lockThreadedIO();
8078 more = listLength(server.io_newjobs) <
8079 (unsigned) server.vm_max_threads;
8080 unlockThreadedIO();
8081 /* Don't waste CPU time if swappable objects are rare. */
 /* Once a swap attempt fails 'trytoswap' is cleared, so no more
  * attempts are made for the rest of this call. */
b0d8747d 8082 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8083 trytoswap = 0;
8084 break;
8085 }
f11b8647 8086 }
8087 }
b9bc0eef 8088 }
c953f24b 8089 processed++;
f6c0bba8 8090 if (processed == toprocess) return;
996cb5f7 8091 }
 /* EAGAIN is not reported as an error by the check below. */
8092 if (retval < 0 && errno != EAGAIN) {
8093 redisLog(REDIS_WARNING,
8094 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8095 strerror(errno));
8096 }
8097}
8098
8099static void lockThreadedIO(void) {
8100 pthread_mutex_lock(&server.io_mutex);
8101}
8102
8103static void unlockThreadedIO(void) {
8104 pthread_mutex_unlock(&server.io_mutex);
8105}
8106
8107/* Remove the specified object from the threaded I/O queue if still not
8108 * processed, otherwise make sure to flag it as canceled. */
8109static void vmCancelThreadedIOJob(robj *o) {
8110 list *lists[3] = {
6c96ba7d 8111 server.io_newjobs, /* 0 */
8112 server.io_processing, /* 1 */
8113 server.io_processed /* 2 */
996cb5f7 8114 };
8115 int i;
8116
8117 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 8118again:
996cb5f7 8119 lockThreadedIO();
8120 /* Search for a matching key in one of the queues */
8121 for (i = 0; i < 3; i++) {
8122 listNode *ln;
c7df85a4 8123 listIter li;
996cb5f7 8124
c7df85a4 8125 listRewind(lists[i],&li);
8126 while ((ln = listNext(&li)) != NULL) {
996cb5f7 8127 iojob *job = ln->value;
8128
6c96ba7d 8129 if (job->canceled) continue; /* Skip this, already canceled. */
996cb5f7 8130 if (compareStringObjects(job->key,o) == 0) {
970e10bb 8131 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8132 (void*)job, (char*)o->ptr, job->type, i);
427a2153 8133 /* Mark the pages as free since the swap didn't happened
8134 * or happened but is now discarded. */
970e10bb 8135 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 8136 vmMarkPagesFree(job->page,job->pages);
8137 /* Cancel the job. It depends on the list the job is
8138 * living in. */
996cb5f7 8139 switch(i) {
8140 case 0: /* io_newjobs */
6c96ba7d 8141 /* If the job was yet not processed the best thing to do
996cb5f7 8142 * is to remove it from the queue at all */
6c96ba7d 8143 freeIOJob(job);
996cb5f7 8144 listDelNode(lists[i],ln);
8145 break;
8146 case 1: /* io_processing */
d5d55fc3 8147 /* Oh Shi- the thread is messing with the Job:
8148 *
8149 * Probably it's accessing the object if this is a
8150 * PREPARE_SWAP or DO_SWAP job.
8151 * If it's a LOAD job it may be reading from disk and
8152 * if we don't wait for the job to terminate before to
8153 * cancel it, maybe in a few microseconds data can be
8154 * corrupted in this pages. So the short story is:
8155 *
8156 * Better to wait for the job to move into the
8157 * next queue (processed)... */
8158
8159 /* We try again and again until the job is completed. */
8160 unlockThreadedIO();
8161 /* But let's wait some time for the I/O thread
8162 * to finish with this job. After all this condition
8163 * should be very rare. */
8164 usleep(1);
8165 goto again;
996cb5f7 8166 case 2: /* io_processed */
2e111efe 8167 /* The job was already processed, that's easy...
8168 * just mark it as canceled so that we'll ignore it
8169 * when processing completed jobs. */
996cb5f7 8170 job->canceled = 1;
8171 break;
8172 }
c7df85a4 8173 /* Finally we have to adjust the storage type of the object
8174 * in order to "UNDO" the operaiton. */
996cb5f7 8175 if (o->storage == REDIS_VM_LOADING)
8176 o->storage = REDIS_VM_SWAPPED;
8177 else if (o->storage == REDIS_VM_SWAPPING)
8178 o->storage = REDIS_VM_MEMORY;
8179 unlockThreadedIO();
8180 return;
8181 }
8182 }
8183 }
8184 unlockThreadedIO();
8185 assert(1 != 1); /* We should never reach this */
8186}
8187
b9bc0eef 8188static void *IOThreadEntryPoint(void *arg) {
8189 iojob *j;
8190 listNode *ln;
8191 REDIS_NOTUSED(arg);
8192
8193 pthread_detach(pthread_self());
8194 while(1) {
8195 /* Get a new job to process */
8196 lockThreadedIO();
8197 if (listLength(server.io_newjobs) == 0) {
8198 /* No new jobs in queue, exit. */
9ebed7cf 8199 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8200 (long) pthread_self());
b9bc0eef 8201 server.io_active_threads--;
8202 unlockThreadedIO();
8203 return NULL;
8204 }
8205 ln = listFirst(server.io_newjobs);
8206 j = ln->value;
8207 listDelNode(server.io_newjobs,ln);
8208 /* Add the job in the processing queue */
8209 j->thread = pthread_self();
8210 listAddNodeTail(server.io_processing,j);
8211 ln = listLast(server.io_processing); /* We use ln later to remove it */
8212 unlockThreadedIO();
9ebed7cf 8213 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8214 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 8215
8216 /* Process the Job */
8217 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8218 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 8219 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8220 FILE *fp = fopen("/dev/null","w+");
8221 j->pages = rdbSavedObjectPages(j->val,fp);
8222 fclose(fp);
8223 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 8224 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8225 j->canceled = 1;
b9bc0eef 8226 }
8227
8228 /* Done: insert the job into the processed queue */
9ebed7cf 8229 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8230 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 8231 lockThreadedIO();
8232 listDelNode(server.io_processing,ln);
8233 listAddNodeTail(server.io_processed,j);
8234 unlockThreadedIO();
8235
8236 /* Signal the main thread there is new stuff to process */
8237 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8238 }
8239 return NULL; /* never reached */
8240}
8241
8242static void spawnIOThread(void) {
8243 pthread_t thread;
478c2c6f 8244 sigset_t mask, omask;
b9bc0eef 8245
478c2c6f 8246 sigemptyset(&mask);
8247 sigaddset(&mask,SIGCHLD);
8248 sigaddset(&mask,SIGHUP);
8249 sigaddset(&mask,SIGPIPE);
8250 pthread_sigmask(SIG_SETMASK, &mask, &omask);
bcaa7a4f 8251 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
478c2c6f 8252 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 8253 server.io_active_threads++;
8254}
8255
4ee9488d 8256/* We need to wait for the last thread to exit before we are able to
8257 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 8258static void waitEmptyIOJobsQueue(void) {
4ee9488d 8259 while(1) {
76b7233a 8260 int io_processed_len;
8261
4ee9488d 8262 lockThreadedIO();
054e426d 8263 if (listLength(server.io_newjobs) == 0 &&
8264 listLength(server.io_processing) == 0 &&
8265 server.io_active_threads == 0)
8266 {
4ee9488d 8267 unlockThreadedIO();
8268 return;
8269 }
76b7233a 8270 /* While waiting for empty jobs queue condition we post-process some
8271 * finshed job, as I/O threads may be hanging trying to write against
8272 * the io_ready_pipe_write FD but there are so much pending jobs that
8273 * it's blocking. */
8274 io_processed_len = listLength(server.io_processed);
4ee9488d 8275 unlockThreadedIO();
76b7233a 8276 if (io_processed_len) {
8277 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8278 usleep(1000); /* 1 millisecond */
8279 } else {
8280 usleep(10000); /* 10 milliseconds */
8281 }
4ee9488d 8282 }
8283}
8284
054e426d 8285static void vmReopenSwapFile(void) {
478c2c6f 8286 /* Note: we don't close the old one as we are in the child process
8287 * and don't want to mess at all with the original file object. */
054e426d 8288 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8289 if (server.vm_fp == NULL) {
8290 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8291 server.vm_swap_file);
478c2c6f 8292 _exit(1);
054e426d 8293 }
8294 server.vm_fd = fileno(server.vm_fp);
8295}
8296
b9bc0eef 8297/* This function must be called while with threaded IO locked */
8298static void queueIOJob(iojob *j) {
6c96ba7d 8299 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8300 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 8301 listAddNodeTail(server.io_newjobs,j);
8302 if (server.io_active_threads < server.vm_max_threads)
8303 spawnIOThread();
8304}
8305
8306static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8307 iojob *j;
8308
8309 assert(key->storage == REDIS_VM_MEMORY);
8310 assert(key->refcount == 1);
8311
8312 j = zmalloc(sizeof(*j));
8313 j->type = REDIS_IOJOB_PREPARE_SWAP;
8314 j->db = db;
8315 j->key = dupStringObject(key);
8316 j->val = val;
8317 incrRefCount(val);
8318 j->canceled = 0;
8319 j->thread = (pthread_t) -1;
f11b8647 8320 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 8321
8322 lockThreadedIO();
8323 queueIOJob(j);
8324 unlockThreadedIO();
8325 return REDIS_OK;
8326}
8327
b0d8747d 8328/* ============ Virtual Memory - Blocking clients on missing keys =========== */
8329
d5d55fc3 8330/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8331 * If there is not already a job loading the key, it is craeted.
8332 * The key is added to the io_keys list in the client structure, and also
8333 * in the hash table mapping swapped keys to waiting clients, that is,
8334 * server.io_waited_keys. */
8335static int waitForSwappedKey(redisClient *c, robj *key) {
8336 struct dictEntry *de;
8337 robj *o;
8338 list *l;
8339
8340 /* If the key does not exist or is already in RAM we don't need to
8341 * block the client at all. */
8342 de = dictFind(c->db->dict,key);
8343 if (de == NULL) return 0;
8344 o = dictGetEntryKey(de);
8345 if (o->storage == REDIS_VM_MEMORY) {
8346 return 0;
8347 } else if (o->storage == REDIS_VM_SWAPPING) {
8348 /* We were swapping the key, undo it! */
8349 vmCancelThreadedIOJob(o);
8350 return 0;
8351 }
8352
8353 /* OK: the key is either swapped, or being loaded just now. */
8354
8355 /* Add the key to the list of keys this client is waiting for.
8356 * This maps clients to keys they are waiting for. */
8357 listAddNodeTail(c->io_keys,key);
8358 incrRefCount(key);
8359
8360 /* Add the client to the swapped keys => clients waiting map. */
8361 de = dictFind(c->db->io_keys,key);
8362 if (de == NULL) {
8363 int retval;
8364
8365 /* For every key we take a list of clients blocked for it */
8366 l = listCreate();
8367 retval = dictAdd(c->db->io_keys,key,l);
8368 incrRefCount(key);
8369 assert(retval == DICT_OK);
8370 } else {
8371 l = dictGetEntryVal(de);
8372 }
8373 listAddNodeTail(l,c);
8374
8375 /* Are we already loading the key from disk? If not create a job */
8376 if (o->storage == REDIS_VM_SWAPPED) {
8377 iojob *j;
8378
8379 o->storage = REDIS_VM_LOADING;
8380 j = zmalloc(sizeof(*j));
8381 j->type = REDIS_IOJOB_LOAD;
8382 j->db = c->db;
8383 j->key = dupStringObject(key);
8384 j->key->vtype = o->vtype;
8385 j->page = o->vm.page;
8386 j->val = NULL;
8387 j->canceled = 0;
8388 j->thread = (pthread_t) -1;
8389 lockThreadedIO();
8390 queueIOJob(j);
8391 unlockThreadedIO();
8392 }
8393 return 1;
8394}
8395
b0d8747d 8396/* Is this client attempting to run a command against swapped keys?
d5d55fc3 8397 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 8398 *
d5d55fc3 8399 * The important idea about this function is that it can fail! If keys will
8400 * still be swapped when the client is resumed, this key lookups will
8401 * just block loading keys from disk. In practical terms this should only
8402 * happen with SORT BY command or if there is a bug in this function.
8403 *
8404 * Return 1 if the client is marked as blocked, 0 if the client can
8405 * continue as the keys it is going to access appear to be in memory. */
8406static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 8407 int j, last;
8408
8409 if (cmd->vm_firstkey == 0) return 0;
8410 last = cmd->vm_lastkey;
8411 if (last < 0) last = c->argc+last;
8412 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8413 waitForSwappedKey(c,c->argv[j]);
d5d55fc3 8414 /* If the client was blocked for at least one key, mark it as blocked. */
8415 if (listLength(c->io_keys)) {
8416 c->flags |= REDIS_IO_WAIT;
8417 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8418 server.vm_blocked_clients++;
8419 return 1;
8420 } else {
8421 return 0;
8422 }
8423}
8424
8425/* Remove the 'key' from the list of blocked keys for a given client.
8426 *
8427 * The function returns 1 when there are no longer blocking keys after
8428 * the current one was removed (and the client can be unblocked). */
8429static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8430 list *l;
8431 listNode *ln;
8432 listIter li;
8433 struct dictEntry *de;
8434
8435 /* Remove the key from the list of keys this client is waiting for. */
8436 listRewind(c->io_keys,&li);
8437 while ((ln = listNext(&li)) != NULL) {
8438 if (compareStringObjects(ln->value,key) == 0) {
8439 listDelNode(c->io_keys,ln);
8440 break;
8441 }
8442 }
8443 assert(ln != NULL);
8444
8445 /* Remove the client form the key => waiting clients map. */
8446 de = dictFind(c->db->io_keys,key);
8447 assert(de != NULL);
8448 l = dictGetEntryVal(de);
8449 ln = listSearchKey(l,c);
8450 assert(ln != NULL);
8451 listDelNode(l,ln);
8452 if (listLength(l) == 0)
8453 dictDelete(c->db->io_keys,key);
8454
8455 return listLength(c->io_keys) == 0;
8456}
8457
8458static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8459 struct dictEntry *de;
8460 list *l;
8461 listNode *ln;
8462 int len;
8463
8464 de = dictFind(db->io_keys,key);
8465 if (!de) return;
8466
8467 l = dictGetEntryVal(de);
8468 len = listLength(l);
8469 /* Note: we can't use something like while(listLength(l)) as the list
8470 * can be freed by the calling function when we remove the last element. */
8471 while (len--) {
8472 ln = listFirst(l);
8473 redisClient *c = ln->value;
8474
8475 if (dontWaitForSwappedKey(c,key)) {
8476 /* Put the client in the list of clients ready to go as we
8477 * loaded all the keys about it. */
8478 listAddNodeTail(server.io_ready_clients,c);
8479 }
8480 }
b0d8747d 8481}
b0d8747d 8482
7f957c92 8483/* ================================= Debugging ============================== */
8484
8485static void debugCommand(redisClient *c) {
8486 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8487 *((char*)-1) = 'x';
210e29f7 8488 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8489 if (rdbSave(server.dbfilename) != REDIS_OK) {
8490 addReply(c,shared.err);
8491 return;
8492 }
8493 emptyDb();
8494 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8495 addReply(c,shared.err);
8496 return;
8497 }
8498 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8499 addReply(c,shared.ok);
71c2b467 8500 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8501 emptyDb();
8502 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8503 addReply(c,shared.err);
8504 return;
8505 }
8506 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8507 addReply(c,shared.ok);
333298da 8508 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8509 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8510 robj *key, *val;
8511
8512 if (!de) {
8513 addReply(c,shared.nokeyerr);
8514 return;
8515 }
8516 key = dictGetEntryKey(de);
8517 val = dictGetEntryVal(de);
59146ef3 8518 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8519 key->storage == REDIS_VM_SWAPPING)) {
ace06542 8520 addReplySds(c,sdscatprintf(sdsempty(),
8521 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8522 "encoding:%d serializedlength:%lld\r\n",
682ac724 8523 (void*)key, key->refcount, (void*)val, val->refcount,
459f52a8 8524 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 8525 } else {
8526 addReplySds(c,sdscatprintf(sdsempty(),
8527 "+Key at:%p refcount:%d, value swapped at: page %llu "
8528 "using %llu pages\r\n",
8529 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8530 (unsigned long long) key->vm.usedpages));
8531 }
7d30035d 8532 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8533 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8534 robj *key, *val;
8535
8536 if (!server.vm_enabled) {
8537 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8538 return;
8539 }
8540 if (!de) {
8541 addReply(c,shared.nokeyerr);
8542 return;
8543 }
8544 key = dictGetEntryKey(de);
8545 val = dictGetEntryVal(de);
4ef8de8a 8546 /* If the key is shared we want to create a copy */
8547 if (key->refcount > 1) {
8548 robj *newkey = dupStringObject(key);
8549 decrRefCount(key);
8550 key = dictGetEntryKey(de) = newkey;
8551 }
8552 /* Swap it */
7d30035d 8553 if (key->storage != REDIS_VM_MEMORY) {
8554 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 8555 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 8556 dictGetEntryVal(de) = NULL;
8557 addReply(c,shared.ok);
8558 } else {
8559 addReply(c,shared.err);
8560 }
7f957c92 8561 } else {
333298da 8562 addReplySds(c,sdsnew(
7d30035d 8563 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 8564 }
8565}
56906eef 8566
6c96ba7d 8567static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 8568 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 8569 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 8570#ifdef HAVE_BACKTRACE
8571 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8572 *((char*)-1) = 'x';
8573#endif
8574}
8575
bcfc686d 8576/* =================================== Main! ================================ */
56906eef 8577
bcfc686d 8578#ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 *
 * Returns the non-negative integer found in the file, or -1 if the file
 * can't be opened or read, or if its content is not a valid non-negative
 * integer. strtol() is used instead of atoi() because atoi() returns 0 for
 * unparsable input, which is indistinguishable from a real value of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    errno = 0;
    value = strtol(buf,&end,10);
    /* No digits consumed, out of range for long, negative, or too big to be
     * returned as an int: report an error. */
    if (end == buf || errno == ERANGE) return -1;
    if (value < 0 || value != (long)(int)value) return -1;
    return (int)value;
}
8592
8593void linuxOvercommitMemoryWarning(void) {
8594 if (linuxOvercommitMemoryValue() == 0) {
8595 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8596 }
8597}
8598#endif /* __linux__ */
8599
8600static void daemonize(void) {
8601 int fd;
8602 FILE *fp;
8603
8604 if (fork() != 0) exit(0); /* parent exits */
8605 setsid(); /* create a new session */
8606
8607 /* Every output goes to /dev/null. If Redis is daemonized but
8608 * the 'logfile' is set to 'stdout' in the configuration file
8609 * it will not log at all. */
8610 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8611 dup2(fd, STDIN_FILENO);
8612 dup2(fd, STDOUT_FILENO);
8613 dup2(fd, STDERR_FILENO);
8614 if (fd > STDERR_FILENO) close(fd);
8615 }
8616 /* Try to write the pid file */
8617 fp = fopen(server.pidfile,"w");
8618 if (fp) {
8619 fprintf(fp,"%d\n",getpid());
8620 fclose(fp);
56906eef 8621 }
56906eef 8622}
8623
bcfc686d 8624int main(int argc, char **argv) {
9651a787 8625 time_t start;
8626
bcfc686d 8627 initServerConfig();
8628 if (argc == 2) {
8629 resetServerSaveParams();
8630 loadServerConfig(argv[1]);
8631 } else if (argc > 2) {
8632 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8633 exit(1);
8634 } else {
8635 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8636 }
bcfc686d 8637 if (server.daemonize) daemonize();
71c54b21 8638 initServer();
bcfc686d 8639 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8640#ifdef __linux__
8641 linuxOvercommitMemoryWarning();
8642#endif
9651a787 8643 start = time(NULL);
bcfc686d 8644 if (server.appendonly) {
8645 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 8646 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 8647 } else {
8648 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 8649 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 8650 }
bcfc686d 8651 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 8652 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 8653 aeMain(server.el);
8654 aeDeleteEventLoop(server.el);
8655 return 0;
8656}
8657
8658/* ============================= Backtrace support ========================= */
8659
8660#ifdef HAVE_BACKTRACE
8661static char *findFuncName(void *pointer, unsigned long *offset);
8662
56906eef 8663static void *getMcontextEip(ucontext_t *uc) {
8664#if defined(__FreeBSD__)
8665 return (void*) uc->uc_mcontext.mc_eip;
8666#elif defined(__dietlibc__)
8667 return (void*) uc->uc_mcontext.eip;
06db1f50 8668#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 8669 #if __x86_64__
8670 return (void*) uc->uc_mcontext->__ss.__rip;
8671 #else
56906eef 8672 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 8673 #endif
06db1f50 8674#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 8675 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 8676 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 8677 #else
8678 return (void*) uc->uc_mcontext->__ss.__eip;
8679 #endif
54bac49d 8680#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 8681 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 8682#elif defined(__ia64__) /* Linux IA64 */
8683 return (void*) uc->uc_mcontext.sc_ip;
8684#else
8685 return NULL;
56906eef 8686#endif
8687}
8688
8689static void segvHandler(int sig, siginfo_t *info, void *secret) {
8690 void *trace[100];
8691 char **messages = NULL;
8692 int i, trace_size = 0;
8693 unsigned long offset=0;
56906eef 8694 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 8695 sds infostring;
56906eef 8696 REDIS_NOTUSED(info);
8697
8698 redisLog(REDIS_WARNING,
8699 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 8700 infostring = genRedisInfoString();
8701 redisLog(REDIS_WARNING, "%s",infostring);
8702 /* It's not safe to sdsfree() the returned string under memory
8703 * corruption conditions. Let it leak as we are going to abort */
56906eef 8704
8705 trace_size = backtrace(trace, 100);
de96dbfe 8706 /* overwrite sigaction with caller's address */
b91cf5ef 8707 if (getMcontextEip(uc) != NULL) {
8708 trace[1] = getMcontextEip(uc);
8709 }
56906eef 8710 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 8711
d76412d1 8712 for (i=1; i<trace_size; ++i) {
56906eef 8713 char *fn = findFuncName(trace[i], &offset), *p;
8714
8715 p = strchr(messages[i],'+');
8716 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8717 redisLog(REDIS_WARNING,"%s", messages[i]);
8718 } else {
8719 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8720 }
8721 }
b177fd30 8722 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 8723 _exit(0);
fe3bbfbe 8724}
56906eef 8725
8726static void setupSigSegvAction(void) {
8727 struct sigaction act;
8728
8729 sigemptyset (&act.sa_mask);
8730 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8731 * is used. Otherwise, sa_handler is used */
8732 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8733 act.sa_sigaction = segvHandler;
8734 sigaction (SIGSEGV, &act, NULL);
8735 sigaction (SIGBUS, &act, NULL);
12fea928 8736 sigaction (SIGFPE, &act, NULL);
8737 sigaction (SIGILL, &act, NULL);
8738 sigaction (SIGBUS, &act, NULL);
e65fdc78 8739 return;
56906eef 8740}
e65fdc78 8741
bcfc686d 8742#include "staticsymbols.h"
8743/* This function try to convert a pointer into a function name. It's used in
8744 * oreder to provide a backtrace under segmentation fault that's able to
8745 * display functions declared as static (otherwise the backtrace is useless). */
8746static char *findFuncName(void *pointer, unsigned long *offset){
8747 int i, ret = -1;
8748 unsigned long off, minoff = 0;
ed9b544e 8749
bcfc686d 8750 /* Try to match against the Symbol with the smallest offset */
8751 for (i=0; symsTable[i].pointer; i++) {
8752 unsigned long lp = (unsigned long) pointer;
0bc03378 8753
bcfc686d 8754 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8755 off=lp-symsTable[i].pointer;
8756 if (ret < 0 || off < minoff) {
8757 minoff=off;
8758 ret=i;
8759 }
8760 }
0bc03378 8761 }
bcfc686d 8762 if (ret == -1) return NULL;
8763 *offset = minoff;
8764 return symsTable[ret].name;
0bc03378 8765}
bcfc686d 8766#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is installed. */
static void setupSigSegvAction(void)
{
    /* Intentionally empty. */
}
bcfc686d 8769#endif /* HAVE_BACKTRACE */
0bc03378 8770
ed9b544e 8771
ed9b544e 8772
bcfc686d 8773/* The End */
8774
8775
ed9b544e 8776