]> git.saurik.com Git - redis.git/blame - redis.c
An interesting refactoring + more expressive internal API
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
ed745154 30#define REDIS_VERSION "1.3.4"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
93ea3759 89#define REDIS_STATIC_ARGS 4
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94754ccc 94#define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
109/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
ed9b544e 114
115/* Object types */
116#define REDIS_STRING 0
117#define REDIS_LIST 1
118#define REDIS_SET 2
1812e024 119#define REDIS_ZSET 3
120#define REDIS_HASH 4
f78fd11b 121
5234952b 122/* Objects encoding. Some kind of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
942a3961 125#define REDIS_ENCODING_RAW 0 /* Raw representation */
126#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 127#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 129
/* Human readable names of the object encodings, listed in the same order as
 * the REDIS_ENCODING_* values (RAW=0, INT=1, ZIPMAP=2, HT=3).
 * NOTE(review): presumably indexed by robj->encoding; keep both in sync. */
07efaf74 130static char* strencoding[] = {
131 "raw", "int", "zipmap", "hashtable"
132};
133
f78fd11b 134/* Object types only used for dumping to disk */
bb32ede5 135#define REDIS_EXPIRETIME 253
ed9b544e 136#define REDIS_SELECTDB 254
137#define REDIS_EOF 255
138
f78fd11b 139/* Defines related to the dump file format. Storing 32 bit lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specifies the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
f78fd11b 149 *
10c43610 150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
f78fd11b 152#define REDIS_RDB_6BITLEN 0
153#define REDIS_RDB_14BITLEN 1
154#define REDIS_RDB_32BITLEN 2
17be1a4a 155#define REDIS_RDB_ENCVAL 3
f78fd11b 156#define REDIS_RDB_LENERR UINT_MAX
157
a4d1ba9a 158/* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 164#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 165
75680a3c 166/* Virtual memory object->where field. */
167#define REDIS_VM_MEMORY 0 /* The object is on memory */
168#define REDIS_VM_SWAPPED 1 /* The object is on disk */
169#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
06224fec 172/* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174#define REDIS_VM_MAX_NEAR_PAGES 65536
175#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 176#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 177#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 178/* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
c953f24b 182#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 183
ed9b544e 184/* Client flags */
d5d55fc3 185#define REDIS_SLAVE 1 /* This client is a slave server */
186#define REDIS_MASTER 2 /* This client is a master server */
187#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188#define REDIS_MULTI 8 /* This client is in a MULTI context */
189#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 191
40d224a9 192/* Slave replication state - slave side */
ed9b544e 193#define REDIS_REPL_NONE 0 /* No active replication */
194#define REDIS_REPL_CONNECT 1 /* Must connect to master */
195#define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
40d224a9 197/* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
ed9b544e 206/* List related stuff */
207#define REDIS_HEAD 0
208#define REDIS_TAIL 1
209
210/* Sort operations */
211#define REDIS_SORT_GET 0
443c6409 212#define REDIS_SORT_ASC 1
213#define REDIS_SORT_DESC 2
ed9b544e 214#define REDIS_SORTKEY_MAX 1024
215
216/* Log levels */
217#define REDIS_DEBUG 0
f870935d 218#define REDIS_VERBOSE 1
219#define REDIS_NOTICE 2
220#define REDIS_WARNING 3
ed9b544e 221
222/* Anti-warning macro... */
223#define REDIS_NOTUSED(V) ((void) V)
224
6b47e12e 225#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 227
48f0308a 228/* Append only defines */
229#define APPENDFSYNC_NO 0
230#define APPENDFSYNC_ALWAYS 1
231#define APPENDFSYNC_EVERYSEC 2
232
cbba7dd7 233/* Hashes related defaults */
234#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
dfc5e96c 237/* We can print the stacktrace, so our assert is defined this way:
 * when the expression _e is false, _redisAssert() is called with the
 * stringified expression, the current file name and the line number, and
 * then the process is terminated with _exit(1). When _e is true the macro
 * evaluates to a void expression and does nothing. */
478c2c6f 238#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
6c96ba7d 239static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 240
ed9b544e 241/*================================= Data types ============================== */
242
243/* A redis object, that is a type able to hold a string / list / set */
75680a3c 244
245/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed. It is embedded as the last member of struct
 * redisObject.
 * NOTE(review): the trailing `vm` declarator below also defines a file-scope
 * variable named `vm` of this type; this looks unintentional - confirm
 * before removing it. */
246struct redisObjectVM {
3a66edc7 247 off_t page; /* the page at which the object is stored on disk */
248 off_t usedpages; /* number of pages used on disk */
249 time_t atime; /* Last access time */
75680a3c 250} vm;
251
252/* The actual Redis Object: a typed, reference counted container for a
 * value, followed by the optional Virtual Memory bookkeeping fields. */
ed9b544e 253typedef struct redisObject {
ed9b544e 254 void *ptr; /* Pointer to the value; its layout depends on type/encoding */
942a3961 255 unsigned char type; /* One of the object types (REDIS_STRING, ...) */
256 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
d894161b 257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
ed9b544e 261 int refcount; /* Reference count (see incrRefCount / decrRefCount) */
75680a3c 262 /* VM fields, these are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead.
 * For this to work `vm` must stay the last member of the structure. */
266 struct redisObjectVM vm;
ed9b544e 267} robj;
268
dfc5e96c 269/* Macro used to initialize a Redis object allocated on the stack.
270 * Note that this macro is kept near the structure definition to make sure
271 * we'll update it when the structure is changed, to avoid bugs like
272 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable itself (not a pointer to it) and _ptr is the
 * value stored in its ptr field. The object is set up as a raw encoded
 * string with a reference count of 1; the storage field is only touched
 * when Virtual Memory is enabled.
 *
 * The expansion intentionally does NOT end with a semicolon: the caller
 * supplies it, so that the macro behaves like a single statement and can be
 * used safely in an unbraced if/else. Arguments are parenthesized so that
 * expressions can be passed without precedence surprises. */
273#define initStaticStringObject(_var,_ptr) do { \
274 (_var).refcount = 1; \
275 (_var).type = REDIS_STRING; \
276 (_var).encoding = REDIS_ENCODING_RAW; \
277 (_var).ptr = (_ptr); \
3a66edc7 278 if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
dfc5e96c 279} while(0)
280
/* State of a single database: the keyspace plus the auxiliary dictionaries
 * tracking expires and the keys that clients are blocked on. */
3305306f 281typedef struct redisDb {
4409877e 282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 285 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 286 int id; /* Database ID */
287} redisDb;
288
6e469882 289/* Client MULTI/EXEC state: a single command of a MULTI block.
 * multiState.commands is an array of these entries. */
290typedef struct multiCmd {
291 robj **argv; /* Arguments of the command */
292 int argc; /* Number of arguments (presumably the length of argv) */
293 struct redisCommand *cmd; /* The command to run */
294} multiCmd;
295
/* The commands accumulated by a client for MULTI/EXEC (stored in the
 * mstate field of redisClient). */
296typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299} multiState;
300
ed9b544e 301/* With multiplexing we need to keep per-client state.
302 * Clients are kept in a linked list. */
303typedef struct redisClient {
304 int fd; /* Client file descriptor */
3305306f 305 redisDb *db; /* Database this client operates on (presumably the selected one) */
ed9b544e 306 int dictid;
307 sds querybuf; /* Buffer of input received from the client (presumably; confirm) */
e8a74421 308 robj **argv, **mbargv;
309 int argc, mbargc;
40d224a9 310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 311 int multibulk; /* multi bulk command format active */
ed9b544e 312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
6e469882 320 long repldboff; /* replication DB file offset */
40d224a9 321 off_t repldbsize; /* replication DB file size */
6e469882 322 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 323 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 324 * operation such as BLPOP. Otherwise NULL. */
b177fd30 325 int blockingkeysnum; /* Number of blocking keys */
4409877e 326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
92f8e882 328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
ed9b544e 330} redisClient;
331
/* One element of the redisServer.saveparams array.
 * NOTE(review): presumably a save point, i.e. "save the DB after `seconds`
 * seconds if at least `changes` changes were performed" - confirm against
 * the code that reads it. */
332struct saveparam {
333 time_t seconds;
334 int changes;
335};
336
337/* Global server state structure. A single instance exists, the file-scope
 * variable `server`. Fields are grouped in: runtime state, stats,
 * configuration, replication, sort state, Virtual Memory configuration,
 * state, threaded I/O and stats. */
338struct redisServer {
339 int port; /* TCP port (see REDIS_SERVERPORT) */
340 int fd;
3305306f 341 redisDb *db;
4409877e 342 dict *sharingpool; /* Pool used for object sharing */
10c43610 343 unsigned int sharingpoolsize;
ed9b544e 344 long long dirty; /* changes to DB from the last save */
345 list *clients;
87eca727 346 list *slaves, *monitors;
ed9b544e 347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function ran */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of the last successful save */
ed9b544e 352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 /* Configuration */
357 int verbosity; /* presumably one of the log levels (REDIS_DEBUG ...) */
358 int glueoutputbuf;
359 int maxidletime; /* client timeout (see REDIS_MAXIDLETIME) */
360 int dbnum;
361 int daemonize;
44b38ef4 362 int appendonly;
48f0308a 363 int appendfsync; /* presumably one of the APPENDFSYNC_* values */
364 time_t lastfsync;
44b38ef4 365 int appendfd;
366 int appendseldb;
ed329fcf 367 char *pidfile;
9f3c422c 368 pid_t bgsavechildpid;
9d65a1bb 369 pid_t bgrewritechildpid;
370 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 371 struct saveparam *saveparams;
372 int saveparamslen;
373 char *logfile;
374 char *bindaddr;
375 char *dbfilename;
44b38ef4 376 char *appendfilename;
abcb223e 377 char *requirepass;
10c43610 378 int shareobjects;
121f70cf 379 int rdbcompression;
ed9b544e 380 /* Replication related */
381 int isslave;
d0ccebcf 382 char *masterauth;
ed9b544e 383 char *masterhost;
384 int masterport;
40d224a9 385 redisClient *master; /* client that is master for this slave */
ed9b544e 386 int replstate;
285add55 387 unsigned int maxclients;
4ef8de8a 388 unsigned long long maxmemory;
d5d55fc3 389 unsigned int blpop_blocked_clients;
390 unsigned int vm_blocked_clients;
ed9b544e 391 /* Sort parameters - qsort_r() is only available under BSD so we
392 * have to keep this state global, in order to pass it to sortCompare() */
393 int sort_desc;
394 int sort_alpha;
395 int sort_bypattern;
75680a3c 396 /* Virtual memory configuration */
397 int vm_enabled;
054e426d 398 char *vm_swap_file;
75680a3c 399 off_t vm_page_size;
400 off_t vm_pages;
4ef8de8a 401 unsigned long long vm_max_memory;
cbba7dd7 402 /* Hashes config */
403 size_t hash_max_zipmap_entries;
404 size_t hash_max_zipmap_value;
75680a3c 405 /* Virtual memory state */
406 FILE *vm_fp;
407 int vm_fd;
408 off_t vm_next_page; /* Next probably empty page */
409 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 410 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 411 time_t unixtime; /* Unix time sampled every second. */
92f8e882 412 /* Virtual memory I/O threads stuff */
92f8e882 413 /* An I/O thread processes an element taken from the io_newjobs list and
996cb5f7 414 * puts the result of the operation in the io_processed list. While the
415 * job is being processed, it's put on the io_processing list. */
416 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
417 list *io_processing; /* List of VM I/O jobs being processed */
418 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 419 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 420 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (presumably
 * io_newjobs/io_processing/io_processed;
 * confirm) */
a5819310 421 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
422 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 423 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 424 int io_active_threads; /* Number of running I/O threads */
425 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 426 /* Our main thread is blocked on the event loop, looking for sockets ready
427 * to be read or written, so when a threaded I/O operation is ready to be
428 * processed by the main thread, the I/O thread will use a unix pipe to
429 * awake the main thread. The following are the two pipe FDs. */
430 int io_ready_pipe_read;
431 int io_ready_pipe_write;
7d98e08c 432 /* Virtual memory stats */
433 unsigned long long vm_stats_used_pages;
434 unsigned long long vm_stats_swapped_objects;
435 unsigned long long vm_stats_swapouts;
436 unsigned long long vm_stats_swapins;
b9bc0eef 437 FILE *devnull;
ed9b544e 438};
439
/* Signature of a command implementation: it receives the client that issued
 * the command. */
440typedef void redisCommandProc(redisClient *c);
/* One entry of the command table (cmdTable): the command name, its
 * implementation and the metadata used to validate and dispatch it. */
441struct redisCommand {
442 char *name; /* Command name, e.g. "get" */
443 redisCommandProc *proc; /* Function implementing the command */
444 int arity; /* Number of arguments. NOTE(review): cmdTable also uses
 * negative values, presumably meaning "at least N" -
 * confirm in processCommand() */
445 int flags; /* Bitwise OR of the REDIS_CMD_* flags */
7c775e09 446 /* What keys should be loaded in background when calling this command? */
447 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
448 int vm_lastkey; /* The last argument that's a key */
449 int vm_keystep; /* The step between first and last key */
ed9b544e 450};
451
/* A (name, address) pair describing a function symbol.
 * NOTE(review): presumably used to resolve addresses to function names when
 * printing a stack trace - confirm against the code that uses it. */
de96dbfe 452struct redisFunctionSym {
453 char *name; /* Function name */
56906eef 454 unsigned long pointer; /* Function address stored as an integer */
de96dbfe 455};
456
/* An object together with the value it is compared by.
 * NOTE(review): presumably one element of the vector sorted by the SORT
 * command, with `u` holding either a numeric score or the object to compare
 * by, depending on the sort mode - confirm in sortCommand()/sortCompare(). */
ed9b544e 457typedef struct _redisSortObject {
458 robj *obj;
459 union {
460 double score;
461 robj *cmpobj;
462 } u;
463} redisSortObject;
464
/* An operation together with the pattern it applies to.
 * NOTE(review): `type` is presumably one of the REDIS_SORT_* values defined
 * under "Sort operations" - confirm in sortCommand(). */
465typedef struct _redisSortOperation {
466 int type;
467 robj *pattern;
468} redisSortOperation;
469
6b47e12e 470/* ZSETs use a specialized version of Skiplists */
471
/* A node of the skiplist used by sorted sets: an object and its score, plus
 * the links to the neighbour nodes.
 * NOTE(review): `forward` and `span` are presumably arrays with one entry
 * per level of the node - confirm in the zsl* functions. */
472typedef struct zskiplistNode {
473 struct zskiplistNode **forward;
e3870fab 474 struct zskiplistNode *backward;
912b9165 475 unsigned int *span;
6b47e12e 476 double score;
477 robj *obj;
478} zskiplistNode;
479
/* The skiplist itself: pointers to the header and tail nodes, plus the
 * list length and level.
 * NOTE(review): `length` is presumably the number of elements and `level`
 * the highest level currently in use (bounded by ZSKIPLIST_MAXLEVEL) -
 * confirm in zslCreate()/zslInsert(). */
480typedef struct zskiplist {
e3870fab 481 struct zskiplistNode *header, *tail;
d13f767c 482 unsigned long length;
6b47e12e 483 int level;
484} zskiplist;
485
/* A sorted set value: a hash table paired with a skiplist.
 * NOTE(review): presumably the dict maps members to scores while the
 * skiplist keeps the members ordered by score - confirm in the z* commands. */
1812e024 486typedef struct zset {
487 dict *dict;
6b47e12e 488 zskiplist *zsl;
1812e024 489} zset;
490
6b47e12e 491/* Our shared "common" objects */
492
/* Collection of pre-built "common" objects, available through the global
 * variable `shared` defined together with the structure.
 * NOTE(review): judging by the names these are frequently used protocol
 * replies, error messages and SELECT commands - confirm where they are
 * created. */
ed9b544e 493struct sharedObjectsStruct {
c937aa89 494 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 495 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 496 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
497 *outofrangeerr, *plus,
ed9b544e 498 *select0, *select1, *select2, *select3, *select4,
499 *select5, *select6, *select7, *select8, *select9;
500} shared;
501
a7866db6 502/* Global vars that are actually used as constants. The following double
503 * values are used for double on-disk serialization, and are initialized
504 * at runtime to avoid strange compiler optimizations. */
505
506static double R_Zero, R_PosInf, R_NegInf, R_Nan;
507
92f8e882 508/* VM threaded I/O request message */
b9bc0eef 509#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
510#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
511#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* Description of a single job handled by the Virtual Memory I/O threads.
 * The job type is one of the REDIS_IOJOB_* values. */
d5d55fc3 512typedef struct iojob {
996cb5f7 513 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 514 redisDb *db;/* Redis database */
92f8e882 515 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 516 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 517 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
518 off_t page; /* Swap page where to read/write the object */
b9bc0eef 519 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 520 int canceled; /* True if this command was canceled by blocking side of VM */
521 pthread_t thread; /* ID of the thread processing this entry */
522} iojob;
92f8e882 523
ed9b544e 524/*================================ Prototypes =============================== */
525
526static void freeStringObject(robj *o);
527static void freeListObject(robj *o);
528static void freeSetObject(robj *o);
529static void decrRefCount(void *o);
530static robj *createObject(int type, void *ptr);
531static void freeClient(redisClient *c);
f78fd11b 532static int rdbLoad(char *filename);
ed9b544e 533static void addReply(redisClient *c, robj *obj);
534static void addReplySds(redisClient *c, sds s);
535static void incrRefCount(robj *o);
f78fd11b 536static int rdbSaveBackground(char *filename);
ed9b544e 537static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 538static robj *dupStringObject(robj *o);
87eca727 539static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 540static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 541static int syncWithMaster(void);
10c43610 542static robj *tryObjectSharing(robj *o);
942a3961 543static int tryObjectEncoding(robj *o);
9d65a1bb 544static robj *getDecodedObject(robj *o);
3305306f 545static int removeExpire(redisDb *db, robj *key);
546static int expireIfNeeded(redisDb *db, robj *key);
547static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 548static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 549static int deleteKey(redisDb *db, robj *key);
bb32ede5 550static time_t getExpire(redisDb *db, robj *key);
551static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 552static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 553static void freeMemoryIfNeeded(void);
de96dbfe 554static int processCommand(redisClient *c);
56906eef 555static void setupSigSegvAction(void);
a3b21203 556static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 557static void aofRemoveTempFile(pid_t childpid);
0ea663ea 558static size_t stringObjectLen(robj *o);
638e42ac 559static void processInputBuffer(redisClient *c);
6b47e12e 560static zskiplist *zslCreate(void);
fd8ccf44 561static void zslFree(zskiplist *zsl);
2b59cfdf 562static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 563static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 564static void initClientMultiState(redisClient *c);
565static void freeClientMultiState(redisClient *c);
566static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 567static void unblockClientWaitingData(redisClient *c);
4409877e 568static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 569static void vmInit(void);
a35ddf12 570static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 571static robj *vmLoadObject(robj *key);
7e69548d 572static robj *vmPreviewObject(robj *key);
a69a0c9c 573static int vmSwapOneObjectBlocking(void);
574static int vmSwapOneObjectThreaded(void);
7e69548d 575static int vmCanSwapOut(void);
a5819310 576static int tryFreeOneObjectFromFreelist(void);
996cb5f7 577static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
578static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
579static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 580static void lockThreadedIO(void);
581static void unlockThreadedIO(void);
582static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
583static void freeIOJob(iojob *j);
584static void queueIOJob(iojob *j);
a5819310 585static int vmWriteObjectOnSwap(robj *o, off_t page);
586static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 587static void waitEmptyIOJobsQueue(void);
588static void vmReopenSwapFile(void);
970e10bb 589static int vmFreePage(off_t page);
d5d55fc3 590static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
591static int dontWaitForSwappedKey(redisClient *c, robj *key);
592static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
593static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
594static struct redisCommand *lookupCommand(char *name);
595static void call(redisClient *c, struct redisCommand *cmd);
596static void resetClient(redisClient *c);
ada386b2 597static void convertToRealHash(robj *o);
ed9b544e 598
abcb223e 599static void authCommand(redisClient *c);
ed9b544e 600static void pingCommand(redisClient *c);
601static void echoCommand(redisClient *c);
602static void setCommand(redisClient *c);
603static void setnxCommand(redisClient *c);
604static void getCommand(redisClient *c);
605static void delCommand(redisClient *c);
606static void existsCommand(redisClient *c);
607static void incrCommand(redisClient *c);
608static void decrCommand(redisClient *c);
609static void incrbyCommand(redisClient *c);
610static void decrbyCommand(redisClient *c);
611static void selectCommand(redisClient *c);
612static void randomkeyCommand(redisClient *c);
613static void keysCommand(redisClient *c);
614static void dbsizeCommand(redisClient *c);
615static void lastsaveCommand(redisClient *c);
616static void saveCommand(redisClient *c);
617static void bgsaveCommand(redisClient *c);
9d65a1bb 618static void bgrewriteaofCommand(redisClient *c);
ed9b544e 619static void shutdownCommand(redisClient *c);
620static void moveCommand(redisClient *c);
621static void renameCommand(redisClient *c);
622static void renamenxCommand(redisClient *c);
623static void lpushCommand(redisClient *c);
624static void rpushCommand(redisClient *c);
625static void lpopCommand(redisClient *c);
626static void rpopCommand(redisClient *c);
627static void llenCommand(redisClient *c);
628static void lindexCommand(redisClient *c);
629static void lrangeCommand(redisClient *c);
630static void ltrimCommand(redisClient *c);
631static void typeCommand(redisClient *c);
632static void lsetCommand(redisClient *c);
633static void saddCommand(redisClient *c);
634static void sremCommand(redisClient *c);
a4460ef4 635static void smoveCommand(redisClient *c);
ed9b544e 636static void sismemberCommand(redisClient *c);
637static void scardCommand(redisClient *c);
12fea928 638static void spopCommand(redisClient *c);
2abb95a9 639static void srandmemberCommand(redisClient *c);
ed9b544e 640static void sinterCommand(redisClient *c);
641static void sinterstoreCommand(redisClient *c);
40d224a9 642static void sunionCommand(redisClient *c);
643static void sunionstoreCommand(redisClient *c);
f4f56e1d 644static void sdiffCommand(redisClient *c);
645static void sdiffstoreCommand(redisClient *c);
ed9b544e 646static void syncCommand(redisClient *c);
647static void flushdbCommand(redisClient *c);
648static void flushallCommand(redisClient *c);
649static void sortCommand(redisClient *c);
650static void lremCommand(redisClient *c);
0f5f7e9a 651static void rpoplpushcommand(redisClient *c);
ed9b544e 652static void infoCommand(redisClient *c);
70003d28 653static void mgetCommand(redisClient *c);
87eca727 654static void monitorCommand(redisClient *c);
3305306f 655static void expireCommand(redisClient *c);
802e8373 656static void expireatCommand(redisClient *c);
f6b141c5 657static void getsetCommand(redisClient *c);
fd88489a 658static void ttlCommand(redisClient *c);
321b0e13 659static void slaveofCommand(redisClient *c);
7f957c92 660static void debugCommand(redisClient *c);
f6b141c5 661static void msetCommand(redisClient *c);
662static void msetnxCommand(redisClient *c);
fd8ccf44 663static void zaddCommand(redisClient *c);
7db723ad 664static void zincrbyCommand(redisClient *c);
cc812361 665static void zrangeCommand(redisClient *c);
50c55df5 666static void zrangebyscoreCommand(redisClient *c);
f44dd428 667static void zcountCommand(redisClient *c);
e3870fab 668static void zrevrangeCommand(redisClient *c);
3c41331e 669static void zcardCommand(redisClient *c);
1b7106e7 670static void zremCommand(redisClient *c);
6e333bbe 671static void zscoreCommand(redisClient *c);
1807985b 672static void zremrangebyscoreCommand(redisClient *c);
6e469882 673static void multiCommand(redisClient *c);
674static void execCommand(redisClient *c);
18b6cb76 675static void discardCommand(redisClient *c);
4409877e 676static void blpopCommand(redisClient *c);
677static void brpopCommand(redisClient *c);
4b00bebd 678static void appendCommand(redisClient *c);
39191553 679static void substrCommand(redisClient *c);
69d95c3e 680static void zrankCommand(redisClient *c);
798d9e55 681static void zrevrankCommand(redisClient *c);
978c2c94 682static void hsetCommand(redisClient *c);
683static void hgetCommand(redisClient *c);
07efaf74 684static void hdelCommand(redisClient *c);
92b27fe9 685static void hlenCommand(redisClient *c);
9212eafd 686static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
687static void zunionCommand(redisClient *c);
688static void zinterCommand(redisClient *c);
f6b141c5 689
ed9b544e 690/*================================= Globals ================================= */
691
692/* Global vars */
693static struct redisServer server; /* server global state */
694static struct redisCommand cmdTable[] = {
7c775e09 695 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
696 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
697 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
698 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
39191553 699 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
7c775e09 700 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
701 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
702 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
703 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
704 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
705 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
706 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
707 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
708 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
709 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
710 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
711 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
712 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
713 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
714 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
715 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
716 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
717 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
718 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
719 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
720 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
721 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
722 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
724 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
725 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
726 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
727 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
728 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
729 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
730 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
731 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
732 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
733 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
734 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
735 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
9212eafd 736 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,1,1,1},
2830ca53
PN
737 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
738 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
7c775e09 739 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
740 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
741 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
742 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
743 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
744 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
cc3b01c3 745 {"zrank",zrankCommand,3,REDIS_CMD_BULK,1,1,1},
653c9240 746 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,1,1,1},
978c2c94 747 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
748 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
07efaf74 749 {"hdel",hdelCommand,3,REDIS_CMD_BULK,1,1,1},
92b27fe9 750 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,1,1,1},
7c775e09 751 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
752 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
753 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
754 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
755 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
756 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
757 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
758 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
759 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
760 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
761 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
762 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
763 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
764 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
765 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
766 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
767 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
768 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
769 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
770 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
771 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
772 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
773 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
774 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
775 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
18b6cb76 776 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
7c775e09 777 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
778 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
779 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
780 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
781 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
782 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
783 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
784 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
785 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
786 {NULL,NULL,0,0,0,0,0}
ed9b544e 787};
bcfc686d 788
ed9b544e 789/*============================ Utility functions ============================ */
790
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character from the set; "[^abc]" negates the set,
 *            "a-z" inside the brackets is an inclusive range, and a
 *            backslash escapes the next character of the set
 *   \x       matches the character x literally
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero, comparisons are done through tolower().
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer has to be NUL terminated: every read of pattern[] and
 * string[] is guarded by the corresponding length. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, never looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one subject character, so
             * it can never match the empty string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * "pattern++" below lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a subject character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. Bound the
             * scan by patternLen so bytes after the pattern are never read
             * and patternLen cannot go negative. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
913
56906eef 914static void redisLog(int level, const char *fmt, ...) {
ed9b544e 915 va_list ap;
916 FILE *fp;
917
918 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
919 if (!fp) return;
920
921 va_start(ap, fmt);
922 if (level >= server.verbosity) {
6766f45e 923 char *c = ".-*#";
1904ecc1 924 char buf[64];
925 time_t now;
926
927 now = time(NULL);
6c9385e0 928 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 929 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 930 vfprintf(fp, fmt, ap);
931 fprintf(fp,"\n");
932 fflush(fp);
933 }
934 va_end(ap);
935
936 if (server.logfile) fclose(fp);
937}
938
939/*====================== Hash table type implementation ==================== */
940
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
944
/* Value destructor that releases 'val' with zfree(). 'privdata' is unused. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
950
4409877e 951static void dictListDestructor(void *privdata, void *val)
952{
953 DICT_NOTUSED(privdata);
954 listRelease((list*)val);
955}
956
ed9b544e 957static int sdsDictKeyCompare(void *privdata, const void *key1,
958 const void *key2)
959{
960 int l1,l2;
961 DICT_NOTUSED(privdata);
962
963 l1 = sdslen((sds)key1);
964 l2 = sdslen((sds)key2);
965 if (l1 != l2) return 0;
966 return memcmp(key1, key2, l1) == 0;
967}
968
/* Destructor for dictionary keys/values that are redis objects: drops one
 * reference with decrRefCount(). 'privdata' is unused. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL)
        decrRefCount(val);
}
976
942a3961 977static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 978 const void *key2)
979{
980 const robj *o1 = key1, *o2 = key2;
981 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
982}
983
942a3961 984static unsigned int dictObjHash(const void *key) {
ed9b544e 985 const robj *o = key;
986 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
987}
988
942a3961 989static int dictEncObjKeyCompare(void *privdata, const void *key1,
990 const void *key2)
991{
9d65a1bb 992 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
993 int cmp;
942a3961 994
9d65a1bb 995 o1 = getDecodedObject(o1);
996 o2 = getDecodedObject(o2);
997 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
998 decrRefCount(o1);
999 decrRefCount(o2);
1000 return cmp;
942a3961 1001}
1002
1003static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1004 robj *o = (robj*) key;
942a3961 1005
ed9e4966 1006 if (o->encoding == REDIS_ENCODING_RAW) {
1007 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1008 } else {
1009 if (o->encoding == REDIS_ENCODING_INT) {
1010 char buf[32];
1011 int len;
1012
1013 len = snprintf(buf,32,"%ld",(long)o->ptr);
1014 return dictGenHashFunction((unsigned char*)buf, len);
1015 } else {
1016 unsigned int hash;
1017
1018 o = getDecodedObject(o);
1019 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1020 decrRefCount(o);
1021 return hash;
1022 }
1023 }
942a3961 1024}
1025
f2d9f50f 1026/* Sets type and expires */
ed9b544e 1027static dictType setDictType = {
942a3961 1028 dictEncObjHash, /* hash function */
ed9b544e 1029 NULL, /* key dup */
1030 NULL, /* val dup */
942a3961 1031 dictEncObjKeyCompare, /* key compare */
ed9b544e 1032 dictRedisObjectDestructor, /* key destructor */
1033 NULL /* val destructor */
1034};
1035
f2d9f50f 1036/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1037static dictType zsetDictType = {
1038 dictEncObjHash, /* hash function */
1039 NULL, /* key dup */
1040 NULL, /* val dup */
1041 dictEncObjKeyCompare, /* key compare */
1042 dictRedisObjectDestructor, /* key destructor */
da0a1620 1043 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1044};
1045
f2d9f50f 1046/* Db->dict */
5234952b 1047static dictType dbDictType = {
942a3961 1048 dictObjHash, /* hash function */
ed9b544e 1049 NULL, /* key dup */
1050 NULL, /* val dup */
942a3961 1051 dictObjKeyCompare, /* key compare */
ed9b544e 1052 dictRedisObjectDestructor, /* key destructor */
1053 dictRedisObjectDestructor /* val destructor */
1054};
1055
f2d9f50f 1056/* Db->expires */
1057static dictType keyptrDictType = {
1058 dictObjHash, /* hash function */
1059 NULL, /* key dup */
1060 NULL, /* val dup */
1061 dictObjKeyCompare, /* key compare */
1062 dictRedisObjectDestructor, /* key destructor */
1063 NULL /* val destructor */
1064};
1065
5234952b 1066/* Hash type hash table (note that small hashes are represented with zimpaps) */
1067static dictType hashDictType = {
1068 dictEncObjHash, /* hash function */
1069 NULL, /* key dup */
1070 NULL, /* val dup */
1071 dictEncObjKeyCompare, /* key compare */
1072 dictRedisObjectDestructor, /* key destructor */
1073 dictRedisObjectDestructor /* val destructor */
1074};
1075
4409877e 1076/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1077 * lists as values. It's used for blocking operations (BLPOP) and to
1078 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1079static dictType keylistDictType = {
1080 dictObjHash, /* hash function */
1081 NULL, /* key dup */
1082 NULL, /* val dup */
1083 dictObjKeyCompare, /* key compare */
1084 dictRedisObjectDestructor, /* key destructor */
1085 dictListDestructor /* val destructor */
1086};
1087
ed9b544e 1088/* ========================= Random utility functions ======================= */
1089
1090/* Redis generally does not try to recover from out of memory conditions
1091 * when allocating objects or strings, it is not clear if it will be possible
1092 * to report this condition to the client since the networking layer itself
1093 * is based on heap allocation for send buffers, so we simply abort.
1094 * At least the code will be simpler to read... */
1095static void oom(const char *msg) {
71c54b21 1096 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1097 sleep(1);
1098 abort();
1099}
1100
1101/* ====================== Redis server networking stuff ===================== */
56906eef 1102static void closeTimedoutClients(void) {
ed9b544e 1103 redisClient *c;
ed9b544e 1104 listNode *ln;
1105 time_t now = time(NULL);
c7df85a4 1106 listIter li;
ed9b544e 1107
c7df85a4 1108 listRewind(server.clients,&li);
1109 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1110 c = listNodeValue(ln);
f86a74e9 1111 if (server.maxidletime &&
1112 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1113 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1114 (now - c->lastinteraction > server.maxidletime))
1115 {
f870935d 1116 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1117 freeClient(c);
f86a74e9 1118 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1119 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1120 addReply(c,shared.nullmultibulk);
b0d8747d 1121 unblockClientWaitingData(c);
f86a74e9 1122 }
ed9b544e 1123 }
1124 }
ed9b544e 1125}
1126
12fea928 1127static int htNeedsResize(dict *dict) {
1128 long long size, used;
1129
1130 size = dictSlots(dict);
1131 used = dictSize(dict);
1132 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1133 (used*100/size < REDIS_HT_MINFILL));
1134}
1135
0bc03378 1136/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1137 * we resize the hash table to save memory */
56906eef 1138static void tryResizeHashTables(void) {
0bc03378 1139 int j;
1140
1141 for (j = 0; j < server.dbnum; j++) {
12fea928 1142 if (htNeedsResize(server.db[j].dict)) {
f870935d 1143 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1144 dictResize(server.db[j].dict);
f870935d 1145 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1146 }
12fea928 1147 if (htNeedsResize(server.db[j].expires))
1148 dictResize(server.db[j].expires);
0bc03378 1149 }
1150}
1151
9d65a1bb 1152/* A background saving child (BGSAVE) terminated its work. Handle this. */
1153void backgroundSaveDoneHandler(int statloc) {
1154 int exitcode = WEXITSTATUS(statloc);
1155 int bysignal = WIFSIGNALED(statloc);
1156
1157 if (!bysignal && exitcode == 0) {
1158 redisLog(REDIS_NOTICE,
1159 "Background saving terminated with success");
1160 server.dirty = 0;
1161 server.lastsave = time(NULL);
1162 } else if (!bysignal && exitcode != 0) {
1163 redisLog(REDIS_WARNING, "Background saving error");
1164 } else {
1165 redisLog(REDIS_WARNING,
1166 "Background saving terminated by signal");
1167 rdbRemoveTempFile(server.bgsavechildpid);
1168 }
1169 server.bgsavechildpid = -1;
1170 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1171 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1172 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1173}
1174
1175/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1176 * Handle this. */
1177void backgroundRewriteDoneHandler(int statloc) {
1178 int exitcode = WEXITSTATUS(statloc);
1179 int bysignal = WIFSIGNALED(statloc);
1180
1181 if (!bysignal && exitcode == 0) {
1182 int fd;
1183 char tmpfile[256];
1184
1185 redisLog(REDIS_NOTICE,
1186 "Background append only file rewriting terminated with success");
1187 /* Now it's time to flush the differences accumulated by the parent */
1188 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1189 fd = open(tmpfile,O_WRONLY|O_APPEND);
1190 if (fd == -1) {
1191 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1192 goto cleanup;
1193 }
1194 /* Flush our data... */
1195 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1196 (signed) sdslen(server.bgrewritebuf)) {
1197 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1198 close(fd);
1199 goto cleanup;
1200 }
b32627cd 1201 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1202 /* Now our work is to rename the temp file into the stable file. And
1203 * switch the file descriptor used by the server for append only. */
1204 if (rename(tmpfile,server.appendfilename) == -1) {
1205 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1206 close(fd);
1207 goto cleanup;
1208 }
1209 /* Mission completed... almost */
1210 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1211 if (server.appendfd != -1) {
1212 /* If append only is actually enabled... */
1213 close(server.appendfd);
1214 server.appendfd = fd;
1215 fsync(fd);
85a83172 1216 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1217 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1218 } else {
1219 /* If append only is disabled we just generate a dump in this
1220 * format. Why not? */
1221 close(fd);
1222 }
1223 } else if (!bysignal && exitcode != 0) {
1224 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1225 } else {
1226 redisLog(REDIS_WARNING,
1227 "Background append only file rewriting terminated by signal");
1228 }
1229cleanup:
1230 sdsfree(server.bgrewritebuf);
1231 server.bgrewritebuf = sdsempty();
1232 aofRemoveTempFile(server.bgrewritechildpid);
1233 server.bgrewritechildpid = -1;
1234}
1235
56906eef 1236static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1237 int j, loops = server.cronloops++;
ed9b544e 1238 REDIS_NOTUSED(eventLoop);
1239 REDIS_NOTUSED(id);
1240 REDIS_NOTUSED(clientData);
1241
3a66edc7 1242 /* We take a cached value of the unix time in the global state because
1243 * with virtual memory and aging there is to store the current time
1244 * in objects at every object access, and accuracy is not needed.
1245 * To access a global var is faster than calling time(NULL) */
1246 server.unixtime = time(NULL);
1247
0bc03378 1248 /* Show some info about non-empty databases */
ed9b544e 1249 for (j = 0; j < server.dbnum; j++) {
dec423d9 1250 long long size, used, vkeys;
94754ccc 1251
3305306f 1252 size = dictSlots(server.db[j].dict);
1253 used = dictSize(server.db[j].dict);
94754ccc 1254 vkeys = dictSize(server.db[j].expires);
c3cb078d 1255 if (!(loops % 5) && (used || vkeys)) {
f870935d 1256 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1257 /* dictPrintStats(server.dict); */
ed9b544e 1258 }
ed9b544e 1259 }
1260
0bc03378 1261 /* We don't want to resize the hash tables while a bacground saving
1262 * is in progress: the saving child is created using fork() that is
1263 * implemented with a copy-on-write semantic in most modern systems, so
1264 * if we resize the HT while there is the saving child at work actually
1265 * a lot of memory movements in the parent will cause a lot of pages
1266 * copied. */
9d65a1bb 1267 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1268
ed9b544e 1269 /* Show information about connected clients */
1270 if (!(loops % 5)) {
f870935d 1271 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1272 listLength(server.clients)-listLength(server.slaves),
1273 listLength(server.slaves),
b72f6a4b 1274 zmalloc_used_memory(),
3305306f 1275 dictSize(server.sharingpool));
ed9b544e 1276 }
1277
1278 /* Close connections of timedout clients */
d5d55fc3 1279 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
ed9b544e 1280 closeTimedoutClients();
1281
9d65a1bb 1282 /* Check if a background saving or AOF rewrite in progress terminated */
1283 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1284 int statloc;
9d65a1bb 1285 pid_t pid;
1286
1287 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1288 if (pid == server.bgsavechildpid) {
1289 backgroundSaveDoneHandler(statloc);
ed9b544e 1290 } else {
9d65a1bb 1291 backgroundRewriteDoneHandler(statloc);
ed9b544e 1292 }
ed9b544e 1293 }
1294 } else {
1295 /* If there is not a background saving in progress check if
1296 * we have to save now */
1297 time_t now = time(NULL);
1298 for (j = 0; j < server.saveparamslen; j++) {
1299 struct saveparam *sp = server.saveparams+j;
1300
1301 if (server.dirty >= sp->changes &&
1302 now-server.lastsave > sp->seconds) {
1303 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1304 sp->changes, sp->seconds);
f78fd11b 1305 rdbSaveBackground(server.dbfilename);
ed9b544e 1306 break;
1307 }
1308 }
1309 }
94754ccc 1310
f2324293 1311 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1312 * will use few CPU cycles if there are few expiring keys, otherwise
1313 * it will get more aggressive to avoid that too much memory is used by
1314 * keys that can be removed from the keyspace. */
94754ccc 1315 for (j = 0; j < server.dbnum; j++) {
f2324293 1316 int expired;
94754ccc 1317 redisDb *db = server.db+j;
94754ccc 1318
f2324293 1319 /* Continue to expire if at the end of the cycle more than 25%
1320 * of the keys were expired. */
1321 do {
4ef8de8a 1322 long num = dictSize(db->expires);
94754ccc 1323 time_t now = time(NULL);
1324
f2324293 1325 expired = 0;
94754ccc 1326 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1327 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1328 while (num--) {
1329 dictEntry *de;
1330 time_t t;
1331
1332 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1333 t = (time_t) dictGetEntryVal(de);
1334 if (now > t) {
1335 deleteKey(db,dictGetEntryKey(de));
f2324293 1336 expired++;
94754ccc 1337 }
1338 }
f2324293 1339 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1340 }
1341
4ef8de8a 1342 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1343 * is enbled. Try to free objects from the free list first. */
7e69548d 1344 if (vmCanSwapOut()) {
1345 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1346 server.vm_max_memory)
1347 {
72e9fd40 1348 int retval;
1349
a5819310 1350 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1351 retval = (server.vm_max_threads == 0) ?
1352 vmSwapOneObjectBlocking() :
1353 vmSwapOneObjectThreaded();
1354 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1355 zmalloc_used_memory() >
1356 (server.vm_max_memory+server.vm_max_memory/10))
1357 {
1358 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1359 }
72e9fd40 1360 /* Note that when using threade I/O we free just one object,
1361 * because anyway when the I/O thread in charge to swap this
1362 * object out will finish, the handler of completed jobs
1363 * will try to swap more objects if we are still out of memory. */
1364 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1365 }
1366 }
1367
ed9b544e 1368 /* Check if we should connect to a MASTER */
1369 if (server.replstate == REDIS_REPL_CONNECT) {
1370 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1371 if (syncWithMaster() == REDIS_OK) {
1372 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1373 }
1374 }
1375 return 1000;
1376}
1377
d5d55fc3 1378/* This function gets called every time Redis is entering the
1379 * main loop of the event driven library, that is, before to sleep
1380 * for ready file descriptors. */
1381static void beforeSleep(struct aeEventLoop *eventLoop) {
1382 REDIS_NOTUSED(eventLoop);
1383
1384 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1385 listIter li;
1386 listNode *ln;
1387
1388 listRewind(server.io_ready_clients,&li);
1389 while((ln = listNext(&li))) {
1390 redisClient *c = ln->value;
1391 struct redisCommand *cmd;
1392
1393 /* Resume the client. */
1394 listDelNode(server.io_ready_clients,ln);
1395 c->flags &= (~REDIS_IO_WAIT);
1396 server.vm_blocked_clients--;
1397 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1398 readQueryFromClient, c);
1399 cmd = lookupCommand(c->argv[0]->ptr);
1400 assert(cmd != NULL);
1401 call(c,cmd);
1402 resetClient(c);
1403 /* There may be more data to process in the input buffer. */
1404 if (c->querybuf && sdslen(c->querybuf) > 0)
1405 processInputBuffer(c);
1406 }
1407 }
1408}
1409
ed9b544e 1410static void createSharedObjects(void) {
1411 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1412 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1413 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1414 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1415 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1416 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1417 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1418 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1419 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1420 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1421 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1422 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1423 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1424 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1425 "-ERR no such key\r\n"));
ed9b544e 1426 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1427 "-ERR syntax error\r\n"));
c937aa89 1428 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1429 "-ERR source and destination objects are the same\r\n"));
1430 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1431 "-ERR index out of range\r\n"));
ed9b544e 1432 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1433 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1434 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1435 shared.select0 = createStringObject("select 0\r\n",10);
1436 shared.select1 = createStringObject("select 1\r\n",10);
1437 shared.select2 = createStringObject("select 2\r\n",10);
1438 shared.select3 = createStringObject("select 3\r\n",10);
1439 shared.select4 = createStringObject("select 4\r\n",10);
1440 shared.select5 = createStringObject("select 5\r\n",10);
1441 shared.select6 = createStringObject("select 6\r\n",10);
1442 shared.select7 = createStringObject("select 7\r\n",10);
1443 shared.select8 = createStringObject("select 8\r\n",10);
1444 shared.select9 = createStringObject("select 9\r\n",10);
1445}
1446
1447static void appendServerSaveParams(time_t seconds, int changes) {
1448 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1449 server.saveparams[server.saveparamslen].seconds = seconds;
1450 server.saveparams[server.saveparamslen].changes = changes;
1451 server.saveparamslen++;
1452}
1453
bcfc686d 1454static void resetServerSaveParams() {
ed9b544e 1455 zfree(server.saveparams);
1456 server.saveparams = NULL;
1457 server.saveparamslen = 0;
1458}
1459
1460static void initServerConfig() {
1461 server.dbnum = REDIS_DEFAULT_DBNUM;
1462 server.port = REDIS_SERVERPORT;
f870935d 1463 server.verbosity = REDIS_VERBOSE;
ed9b544e 1464 server.maxidletime = REDIS_MAXIDLETIME;
1465 server.saveparams = NULL;
1466 server.logfile = NULL; /* NULL = log on standard output */
1467 server.bindaddr = NULL;
1468 server.glueoutputbuf = 1;
1469 server.daemonize = 0;
44b38ef4 1470 server.appendonly = 0;
4e141d5a 1471 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1472 server.lastfsync = time(NULL);
44b38ef4 1473 server.appendfd = -1;
1474 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1475 server.pidfile = "/var/run/redis.pid";
ed9b544e 1476 server.dbfilename = "dump.rdb";
9d65a1bb 1477 server.appendfilename = "appendonly.aof";
abcb223e 1478 server.requirepass = NULL;
10c43610 1479 server.shareobjects = 0;
b0553789 1480 server.rdbcompression = 1;
21aecf4b 1481 server.sharingpoolsize = 1024;
285add55 1482 server.maxclients = 0;
d5d55fc3 1483 server.blpop_blocked_clients = 0;
3fd78bcd 1484 server.maxmemory = 0;
75680a3c 1485 server.vm_enabled = 0;
054e426d 1486 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1487 server.vm_page_size = 256; /* 256 bytes per page */
1488 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1489 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1490 server.vm_max_threads = 4;
d5d55fc3 1491 server.vm_blocked_clients = 0;
cbba7dd7 1492 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1493 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1494
bcfc686d 1495 resetServerSaveParams();
ed9b544e 1496
1497 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1498 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1499 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1500 /* Replication related */
1501 server.isslave = 0;
d0ccebcf 1502 server.masterauth = NULL;
ed9b544e 1503 server.masterhost = NULL;
1504 server.masterport = 6379;
1505 server.master = NULL;
1506 server.replstate = REDIS_REPL_NONE;
a7866db6 1507
1508 /* Double constants initialization */
1509 R_Zero = 0.0;
1510 R_PosInf = 1.0/R_Zero;
1511 R_NegInf = -1.0/R_Zero;
1512 R_Nan = R_Zero/R_Zero;
ed9b544e 1513}
1514
1515static void initServer() {
1516 int j;
1517
1518 signal(SIGHUP, SIG_IGN);
1519 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1520 setupSigSegvAction();
ed9b544e 1521
b9bc0eef 1522 server.devnull = fopen("/dev/null","w");
1523 if (server.devnull == NULL) {
1524 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1525 exit(1);
1526 }
ed9b544e 1527 server.clients = listCreate();
1528 server.slaves = listCreate();
87eca727 1529 server.monitors = listCreate();
ed9b544e 1530 server.objfreelist = listCreate();
1531 createSharedObjects();
1532 server.el = aeCreateEventLoop();
3305306f 1533 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1534 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1535 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1536 if (server.fd == -1) {
1537 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1538 exit(1);
1539 }
3305306f 1540 for (j = 0; j < server.dbnum; j++) {
5234952b 1541 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1542 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1543 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1544 if (server.vm_enabled)
1545 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1546 server.db[j].id = j;
1547 }
ed9b544e 1548 server.cronloops = 0;
9f3c422c 1549 server.bgsavechildpid = -1;
9d65a1bb 1550 server.bgrewritechildpid = -1;
1551 server.bgrewritebuf = sdsempty();
ed9b544e 1552 server.lastsave = time(NULL);
1553 server.dirty = 0;
ed9b544e 1554 server.stat_numcommands = 0;
1555 server.stat_numconnections = 0;
1556 server.stat_starttime = time(NULL);
3a66edc7 1557 server.unixtime = time(NULL);
d8f8b666 1558 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1559 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1560 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1561
1562 if (server.appendonly) {
71eba477 1563 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1564 if (server.appendfd == -1) {
1565 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1566 strerror(errno));
1567 exit(1);
1568 }
1569 }
75680a3c 1570
1571 if (server.vm_enabled) vmInit();
ed9b544e 1572}
1573
1574/* Empty the whole database */
ca37e9cd 1575static long long emptyDb() {
ed9b544e 1576 int j;
ca37e9cd 1577 long long removed = 0;
ed9b544e 1578
3305306f 1579 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1580 removed += dictSize(server.db[j].dict);
3305306f 1581 dictEmpty(server.db[j].dict);
1582 dictEmpty(server.db[j].expires);
1583 }
ca37e9cd 1584 return removed;
ed9b544e 1585}
1586
/* Translate a "yes"/"no" string, compared ignoring case, into 1 or 0.
 * Any other string yields -1 so the caller can report an invalid value. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1592
ed9b544e 1593/* I agree, this is a very rudimental way to load a configuration...
1594 will improve later if the config gets more complex */
1595static void loadServerConfig(char *filename) {
c9a111ac 1596 FILE *fp;
ed9b544e 1597 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1598 int linenum = 0;
1599 sds line = NULL;
c9a111ac 1600
1601 if (filename[0] == '-' && filename[1] == '\0')
1602 fp = stdin;
1603 else {
1604 if ((fp = fopen(filename,"r")) == NULL) {
1605 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1606 exit(1);
1607 }
ed9b544e 1608 }
c9a111ac 1609
ed9b544e 1610 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1611 sds *argv;
1612 int argc, j;
1613
1614 linenum++;
1615 line = sdsnew(buf);
1616 line = sdstrim(line," \t\r\n");
1617
1618 /* Skip comments and blank lines*/
1619 if (line[0] == '#' || line[0] == '\0') {
1620 sdsfree(line);
1621 continue;
1622 }
1623
1624 /* Split into arguments */
1625 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1626 sdstolower(argv[0]);
1627
1628 /* Execute config directives */
bb0b03a3 1629 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1630 server.maxidletime = atoi(argv[1]);
0150db36 1631 if (server.maxidletime < 0) {
ed9b544e 1632 err = "Invalid timeout value"; goto loaderr;
1633 }
bb0b03a3 1634 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1635 server.port = atoi(argv[1]);
1636 if (server.port < 1 || server.port > 65535) {
1637 err = "Invalid port"; goto loaderr;
1638 }
bb0b03a3 1639 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1640 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1641 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1642 int seconds = atoi(argv[1]);
1643 int changes = atoi(argv[2]);
1644 if (seconds < 1 || changes < 0) {
1645 err = "Invalid save parameters"; goto loaderr;
1646 }
1647 appendServerSaveParams(seconds,changes);
bb0b03a3 1648 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1649 if (chdir(argv[1]) == -1) {
1650 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1651 argv[1], strerror(errno));
1652 exit(1);
1653 }
bb0b03a3 1654 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1655 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1656 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1657 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1658 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1659 else {
1660 err = "Invalid log level. Must be one of debug, notice, warning";
1661 goto loaderr;
1662 }
bb0b03a3 1663 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1664 FILE *logfp;
ed9b544e 1665
1666 server.logfile = zstrdup(argv[1]);
bb0b03a3 1667 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1668 zfree(server.logfile);
1669 server.logfile = NULL;
1670 }
1671 if (server.logfile) {
1672 /* Test if we are able to open the file. The server will not
1673 * be able to abort just for this problem later... */
c9a111ac 1674 logfp = fopen(server.logfile,"a");
1675 if (logfp == NULL) {
ed9b544e 1676 err = sdscatprintf(sdsempty(),
1677 "Can't open the log file: %s", strerror(errno));
1678 goto loaderr;
1679 }
c9a111ac 1680 fclose(logfp);
ed9b544e 1681 }
bb0b03a3 1682 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1683 server.dbnum = atoi(argv[1]);
1684 if (server.dbnum < 1) {
1685 err = "Invalid number of databases"; goto loaderr;
1686 }
285add55 1687 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1688 server.maxclients = atoi(argv[1]);
3fd78bcd 1689 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1690 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1691 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1692 server.masterhost = sdsnew(argv[1]);
1693 server.masterport = atoi(argv[2]);
1694 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1695 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1696 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1697 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1698 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1699 err = "argument must be 'yes' or 'no'"; goto loaderr;
1700 }
bb0b03a3 1701 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1702 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1703 err = "argument must be 'yes' or 'no'"; goto loaderr;
1704 }
121f70cf 1705 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1706 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1707 err = "argument must be 'yes' or 'no'"; goto loaderr;
1708 }
e52c65b9 1709 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1710 server.sharingpoolsize = atoi(argv[1]);
1711 if (server.sharingpoolsize < 1) {
1712 err = "invalid object sharing pool size"; goto loaderr;
1713 }
bb0b03a3 1714 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1715 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1716 err = "argument must be 'yes' or 'no'"; goto loaderr;
1717 }
44b38ef4 1718 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1719 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1720 err = "argument must be 'yes' or 'no'"; goto loaderr;
1721 }
48f0308a 1722 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1723 if (!strcasecmp(argv[1],"no")) {
48f0308a 1724 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1725 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1726 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1727 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1728 server.appendfsync = APPENDFSYNC_EVERYSEC;
1729 } else {
1730 err = "argument must be 'no', 'always' or 'everysec'";
1731 goto loaderr;
1732 }
bb0b03a3 1733 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1734 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1735 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
054e426d 1736 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1737 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
054e426d 1738 server.dbfilename = zstrdup(argv[1]);
75680a3c 1739 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1740 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1741 err = "argument must be 'yes' or 'no'"; goto loaderr;
1742 }
054e426d 1743 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1744 zfree(server.vm_swap_file);
054e426d 1745 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1746 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1747 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1748 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1749 server.vm_page_size = strtoll(argv[1], NULL, 10);
1750 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1751 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1752 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1753 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1754 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1755 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1756 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1757 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1758 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1759 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1760 } else {
1761 err = "Bad directive or wrong number of arguments"; goto loaderr;
1762 }
1763 for (j = 0; j < argc; j++)
1764 sdsfree(argv[j]);
1765 zfree(argv);
1766 sdsfree(line);
1767 }
c9a111ac 1768 if (fp != stdin) fclose(fp);
ed9b544e 1769 return;
1770
1771loaderr:
1772 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1773 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1774 fprintf(stderr, ">>> '%s'\n", line);
1775 fprintf(stderr, "%s\n", err);
1776 exit(1);
1777}
1778
1779static void freeClientArgv(redisClient *c) {
1780 int j;
1781
1782 for (j = 0; j < c->argc; j++)
1783 decrRefCount(c->argv[j]);
e8a74421 1784 for (j = 0; j < c->mbargc; j++)
1785 decrRefCount(c->mbargv[j]);
ed9b544e 1786 c->argc = 0;
e8a74421 1787 c->mbargc = 0;
ed9b544e 1788}
1789
1790static void freeClient(redisClient *c) {
1791 listNode *ln;
1792
4409877e 1793 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1794 * call, we have to set querybuf to NULL *before* to call
1795 * unblockClientWaitingData() to avoid processInputBuffer() will get
1796 * called. Also it is important to remove the file events after
1797 * this, because this call adds the READABLE event. */
4409877e 1798 sdsfree(c->querybuf);
1799 c->querybuf = NULL;
1800 if (c->flags & REDIS_BLOCKED)
b0d8747d 1801 unblockClientWaitingData(c);
4409877e 1802
ed9b544e 1803 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1804 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1805 listRelease(c->reply);
1806 freeClientArgv(c);
1807 close(c->fd);
92f8e882 1808 /* Remove from the list of clients */
ed9b544e 1809 ln = listSearchKey(server.clients,c);
dfc5e96c 1810 redisAssert(ln != NULL);
ed9b544e 1811 listDelNode(server.clients,ln);
d5d55fc3 1812 /* Remove from the list of clients waiting for swapped keys */
1813 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1814 ln = listSearchKey(server.io_ready_clients,c);
1815 if (ln) {
1816 listDelNode(server.io_ready_clients,ln);
1817 server.vm_blocked_clients--;
1818 }
1819 }
1820 while (server.vm_enabled && listLength(c->io_keys)) {
1821 ln = listFirst(c->io_keys);
1822 dontWaitForSwappedKey(c,ln->value);
92f8e882 1823 }
b3e3d0d7 1824 listRelease(c->io_keys);
92f8e882 1825 /* Other cleanup */
ed9b544e 1826 if (c->flags & REDIS_SLAVE) {
6208b3a7 1827 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1828 close(c->repldbfd);
87eca727 1829 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1830 ln = listSearchKey(l,c);
dfc5e96c 1831 redisAssert(ln != NULL);
87eca727 1832 listDelNode(l,ln);
ed9b544e 1833 }
1834 if (c->flags & REDIS_MASTER) {
1835 server.master = NULL;
1836 server.replstate = REDIS_REPL_CONNECT;
1837 }
93ea3759 1838 zfree(c->argv);
e8a74421 1839 zfree(c->mbargv);
6e469882 1840 freeClientMultiState(c);
ed9b544e 1841 zfree(c);
1842}
1843
cc30e368 1844#define GLUEREPLY_UP_TO (1024)
ed9b544e 1845static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1846 int copylen = 0;
1847 char buf[GLUEREPLY_UP_TO];
6208b3a7 1848 listNode *ln;
c7df85a4 1849 listIter li;
ed9b544e 1850 robj *o;
1851
c7df85a4 1852 listRewind(c->reply,&li);
1853 while((ln = listNext(&li))) {
c28b42ac 1854 int objlen;
1855
ed9b544e 1856 o = ln->value;
c28b42ac 1857 objlen = sdslen(o->ptr);
1858 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1859 memcpy(buf+copylen,o->ptr,objlen);
1860 copylen += objlen;
ed9b544e 1861 listDelNode(c->reply,ln);
c28b42ac 1862 } else {
1863 if (copylen == 0) return;
1864 break;
ed9b544e 1865 }
ed9b544e 1866 }
c28b42ac 1867 /* Now the output buffer is empty, add the new single element */
1868 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1869 listAddNodeHead(c->reply,o);
ed9b544e 1870}
1871
1872static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1873 redisClient *c = privdata;
1874 int nwritten = 0, totwritten = 0, objlen;
1875 robj *o;
1876 REDIS_NOTUSED(el);
1877 REDIS_NOTUSED(mask);
1878
2895e862 1879 /* Use writev() if we have enough buffers to send */
7ea870c0 1880 if (!server.glueoutputbuf &&
1881 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1882 !(c->flags & REDIS_MASTER))
2895e862 1883 {
1884 sendReplyToClientWritev(el, fd, privdata, mask);
1885 return;
1886 }
2895e862 1887
ed9b544e 1888 while(listLength(c->reply)) {
c28b42ac 1889 if (server.glueoutputbuf && listLength(c->reply) > 1)
1890 glueReplyBuffersIfNeeded(c);
1891
ed9b544e 1892 o = listNodeValue(listFirst(c->reply));
1893 objlen = sdslen(o->ptr);
1894
1895 if (objlen == 0) {
1896 listDelNode(c->reply,listFirst(c->reply));
1897 continue;
1898 }
1899
1900 if (c->flags & REDIS_MASTER) {
6f376729 1901 /* Don't reply to a master */
ed9b544e 1902 nwritten = objlen - c->sentlen;
1903 } else {
a4d1ba9a 1904 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1905 if (nwritten <= 0) break;
1906 }
1907 c->sentlen += nwritten;
1908 totwritten += nwritten;
1909 /* If we fully sent the object on head go to the next one */
1910 if (c->sentlen == objlen) {
1911 listDelNode(c->reply,listFirst(c->reply));
1912 c->sentlen = 0;
1913 }
6f376729 1914 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1915 * bytes, in a single threaded server it's a good idea to serve
6f376729 1916 * other clients as well, even if a very large request comes from
1917 * super fast link that is always able to accept data (in real world
12f9d551 1918 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1919 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1920 }
1921 if (nwritten == -1) {
1922 if (errno == EAGAIN) {
1923 nwritten = 0;
1924 } else {
f870935d 1925 redisLog(REDIS_VERBOSE,
ed9b544e 1926 "Error writing to client: %s", strerror(errno));
1927 freeClient(c);
1928 return;
1929 }
1930 }
1931 if (totwritten > 0) c->lastinteraction = time(NULL);
1932 if (listLength(c->reply) == 0) {
1933 c->sentlen = 0;
1934 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1935 }
1936}
1937
2895e862 1938static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1939{
1940 redisClient *c = privdata;
1941 int nwritten = 0, totwritten = 0, objlen, willwrite;
1942 robj *o;
1943 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1944 int offset, ion = 0;
1945 REDIS_NOTUSED(el);
1946 REDIS_NOTUSED(mask);
1947
1948 listNode *node;
1949 while (listLength(c->reply)) {
1950 offset = c->sentlen;
1951 ion = 0;
1952 willwrite = 0;
1953
1954 /* fill-in the iov[] array */
1955 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1956 o = listNodeValue(node);
1957 objlen = sdslen(o->ptr);
1958
1959 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1960 break;
1961
1962 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1963 break; /* no more iovecs */
1964
1965 iov[ion].iov_base = ((char*)o->ptr) + offset;
1966 iov[ion].iov_len = objlen - offset;
1967 willwrite += objlen - offset;
1968 offset = 0; /* just for the first item */
1969 ion++;
1970 }
1971
1972 if(willwrite == 0)
1973 break;
1974
1975 /* write all collected blocks at once */
1976 if((nwritten = writev(fd, iov, ion)) < 0) {
1977 if (errno != EAGAIN) {
f870935d 1978 redisLog(REDIS_VERBOSE,
2895e862 1979 "Error writing to client: %s", strerror(errno));
1980 freeClient(c);
1981 return;
1982 }
1983 break;
1984 }
1985
1986 totwritten += nwritten;
1987 offset = c->sentlen;
1988
1989 /* remove written robjs from c->reply */
1990 while (nwritten && listLength(c->reply)) {
1991 o = listNodeValue(listFirst(c->reply));
1992 objlen = sdslen(o->ptr);
1993
1994 if(nwritten >= objlen - offset) {
1995 listDelNode(c->reply, listFirst(c->reply));
1996 nwritten -= objlen - offset;
1997 c->sentlen = 0;
1998 } else {
1999 /* partial write */
2000 c->sentlen += nwritten;
2001 break;
2002 }
2003 offset = 0;
2004 }
2005 }
2006
2007 if (totwritten > 0)
2008 c->lastinteraction = time(NULL);
2009
2010 if (listLength(c->reply) == 0) {
2011 c->sentlen = 0;
2012 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2013 }
2014}
2015
ed9b544e 2016static struct redisCommand *lookupCommand(char *name) {
2017 int j = 0;
2018 while(cmdTable[j].name != NULL) {
bb0b03a3 2019 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2020 j++;
2021 }
2022 return NULL;
2023}
2024
2025/* resetClient prepare the client to process the next command */
2026static void resetClient(redisClient *c) {
2027 freeClientArgv(c);
2028 c->bulklen = -1;
e8a74421 2029 c->multibulk = 0;
ed9b544e 2030}
2031
6e469882 2032/* Call() is the core of Redis execution of a command */
2033static void call(redisClient *c, struct redisCommand *cmd) {
2034 long long dirty;
2035
2036 dirty = server.dirty;
2037 cmd->proc(c);
2038 if (server.appendonly && server.dirty-dirty)
2039 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2040 if (server.dirty-dirty && listLength(server.slaves))
2041 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2042 if (listLength(server.monitors))
2043 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2044 server.stat_numcommands++;
2045}
2046
ed9b544e 2047/* If this function gets called we already read a whole
2048 * command, argments are in the client argv/argc fields.
2049 * processCommand() execute the command or prepare the
2050 * server for a bulk read from the client.
2051 *
2052 * If 1 is returned the client is still alive and valid and
2053 * and other operations can be performed by the caller. Otherwise
2054 * if 0 is returned the client was destroied (i.e. after QUIT). */
2055static int processCommand(redisClient *c) {
2056 struct redisCommand *cmd;
ed9b544e 2057
3fd78bcd 2058 /* Free some memory if needed (maxmemory setting) */
2059 if (server.maxmemory) freeMemoryIfNeeded();
2060
e8a74421 2061 /* Handle the multi bulk command type. This is an alternative protocol
2062 * supported by Redis in order to receive commands that are composed of
2063 * multiple binary-safe "bulk" arguments. The latency of processing is
2064 * a bit higher but this allows things like multi-sets, so if this
2065 * protocol is used only for MSET and similar commands this is a big win. */
2066 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2067 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2068 if (c->multibulk <= 0) {
2069 resetClient(c);
2070 return 1;
2071 } else {
2072 decrRefCount(c->argv[c->argc-1]);
2073 c->argc--;
2074 return 1;
2075 }
2076 } else if (c->multibulk) {
2077 if (c->bulklen == -1) {
2078 if (((char*)c->argv[0]->ptr)[0] != '$') {
2079 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2080 resetClient(c);
2081 return 1;
2082 } else {
2083 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2084 decrRefCount(c->argv[0]);
2085 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2086 c->argc--;
2087 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2088 resetClient(c);
2089 return 1;
2090 }
2091 c->argc--;
2092 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2093 return 1;
2094 }
2095 } else {
2096 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2097 c->mbargv[c->mbargc] = c->argv[0];
2098 c->mbargc++;
2099 c->argc--;
2100 c->multibulk--;
2101 if (c->multibulk == 0) {
2102 robj **auxargv;
2103 int auxargc;
2104
2105 /* Here we need to swap the multi-bulk argc/argv with the
2106 * normal argc/argv of the client structure. */
2107 auxargv = c->argv;
2108 c->argv = c->mbargv;
2109 c->mbargv = auxargv;
2110
2111 auxargc = c->argc;
2112 c->argc = c->mbargc;
2113 c->mbargc = auxargc;
2114
2115 /* We need to set bulklen to something different than -1
2116 * in order for the code below to process the command without
2117 * to try to read the last argument of a bulk command as
2118 * a special argument. */
2119 c->bulklen = 0;
2120 /* continue below and process the command */
2121 } else {
2122 c->bulklen = -1;
2123 return 1;
2124 }
2125 }
2126 }
2127 /* -- end of multi bulk commands processing -- */
2128
ed9b544e 2129 /* The QUIT command is handled as a special case. Normal command
2130 * procs are unable to close the client connection safely */
bb0b03a3 2131 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2132 freeClient(c);
2133 return 0;
2134 }
d5d55fc3 2135
2136 /* Now lookup the command and check ASAP about trivial error conditions
2137 * such wrong arity, bad command name and so forth. */
ed9b544e 2138 cmd = lookupCommand(c->argv[0]->ptr);
2139 if (!cmd) {
2c14807b 2140 addReplySds(c,
2141 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2142 (char*)c->argv[0]->ptr));
ed9b544e 2143 resetClient(c);
2144 return 1;
2145 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2146 (c->argc < -cmd->arity)) {
454d4e43 2147 addReplySds(c,
2148 sdscatprintf(sdsempty(),
2149 "-ERR wrong number of arguments for '%s' command\r\n",
2150 cmd->name));
ed9b544e 2151 resetClient(c);
2152 return 1;
3fd78bcd 2153 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2154 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2155 resetClient(c);
2156 return 1;
ed9b544e 2157 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2158 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2159 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2160
2161 decrRefCount(c->argv[c->argc-1]);
2162 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2163 c->argc--;
2164 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2165 resetClient(c);
2166 return 1;
2167 }
2168 c->argc--;
2169 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2170 /* It is possible that the bulk read is already in the
8d0490e7 2171 * buffer. Check this condition and handle it accordingly.
2172 * This is just a fast path, alternative to call processInputBuffer().
2173 * It's a good idea since the code is small and this condition
2174 * happens most of the times. */
ed9b544e 2175 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2176 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2177 c->argc++;
2178 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2179 } else {
d5d55fc3 2180 /* Otherwise return... there is to read the last argument
2181 * from the socket. */
ed9b544e 2182 return 1;
2183 }
2184 }
10c43610 2185 /* Let's try to share objects on the command arguments vector */
2186 if (server.shareobjects) {
2187 int j;
2188 for(j = 1; j < c->argc; j++)
2189 c->argv[j] = tryObjectSharing(c->argv[j]);
2190 }
942a3961 2191 /* Let's try to encode the bulk object to save space. */
2192 if (cmd->flags & REDIS_CMD_BULK)
2193 tryObjectEncoding(c->argv[c->argc-1]);
2194
e63943a4 2195 /* Check if the user is authenticated */
2196 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2197 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2198 resetClient(c);
2199 return 1;
2200 }
2201
ed9b544e 2202 /* Exec the command */
18b6cb76 2203 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2204 queueMultiCommand(c,cmd);
2205 addReply(c,shared.queued);
2206 } else {
d5d55fc3 2207 if (server.vm_enabled && server.vm_max_threads > 0 &&
2208 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2209 call(c,cmd);
2210 }
ed9b544e 2211
2212 /* Prepare the client for the next command */
ed9b544e 2213 resetClient(c);
2214 return 1;
2215}
2216
87eca727 2217static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 2218 listNode *ln;
c7df85a4 2219 listIter li;
ed9b544e 2220 int outc = 0, j;
93ea3759 2221 robj **outv;
2222 /* (args*2)+1 is enough room for args, spaces, newlines */
2223 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2224
2225 if (argc <= REDIS_STATIC_ARGS) {
2226 outv = static_outv;
2227 } else {
2228 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 2229 }
ed9b544e 2230
2231 for (j = 0; j < argc; j++) {
2232 if (j != 0) outv[outc++] = shared.space;
2233 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2234 robj *lenobj;
2235
2236 lenobj = createObject(REDIS_STRING,
682ac724 2237 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 2238 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 2239 lenobj->refcount = 0;
2240 outv[outc++] = lenobj;
2241 }
2242 outv[outc++] = argv[j];
2243 }
2244 outv[outc++] = shared.crlf;
2245
40d224a9 2246 /* Increment all the refcounts at start and decrement at end in order to
2247 * be sure to free objects if there is no slave in a replication state
2248 * able to be feed with commands */
2249 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2250 listRewind(slaves,&li);
2251 while((ln = listNext(&li))) {
ed9b544e 2252 redisClient *slave = ln->value;
40d224a9 2253
2254 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2255 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2256
2257 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2258 if (slave->slaveseldb != dictid) {
2259 robj *selectcmd;
2260
2261 switch(dictid) {
2262 case 0: selectcmd = shared.select0; break;
2263 case 1: selectcmd = shared.select1; break;
2264 case 2: selectcmd = shared.select2; break;
2265 case 3: selectcmd = shared.select3; break;
2266 case 4: selectcmd = shared.select4; break;
2267 case 5: selectcmd = shared.select5; break;
2268 case 6: selectcmd = shared.select6; break;
2269 case 7: selectcmd = shared.select7; break;
2270 case 8: selectcmd = shared.select8; break;
2271 case 9: selectcmd = shared.select9; break;
2272 default:
2273 selectcmd = createObject(REDIS_STRING,
2274 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2275 selectcmd->refcount = 0;
2276 break;
2277 }
2278 addReply(slave,selectcmd);
2279 slave->slaveseldb = dictid;
2280 }
2281 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2282 }
40d224a9 2283 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2284 if (outv != static_outv) zfree(outv);
ed9b544e 2285}
2286
638e42ac 2287static void processInputBuffer(redisClient *c) {
ed9b544e 2288again:
4409877e 2289 /* Before to process the input buffer, make sure the client is not
2290 * waitig for a blocking operation such as BLPOP. Note that the first
2291 * iteration the client is never blocked, otherwise the processInputBuffer
2292 * would not be called at all, but after the execution of the first commands
2293 * in the input buffer the client may be blocked, and the "goto again"
2294 * will try to reiterate. The following line will make it return asap. */
92f8e882 2295 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2296 if (c->bulklen == -1) {
2297 /* Read the first line of the query */
2298 char *p = strchr(c->querybuf,'\n');
2299 size_t querylen;
644fafa3 2300
ed9b544e 2301 if (p) {
2302 sds query, *argv;
2303 int argc, j;
2304
2305 query = c->querybuf;
2306 c->querybuf = sdsempty();
2307 querylen = 1+(p-(query));
2308 if (sdslen(query) > querylen) {
2309 /* leave data after the first line of the query in the buffer */
2310 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2311 }
2312 *p = '\0'; /* remove "\n" */
2313 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2314 sdsupdatelen(query);
2315
2316 /* Now we can split the query in arguments */
ed9b544e 2317 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2318 sdsfree(query);
2319
2320 if (c->argv) zfree(c->argv);
2321 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2322
2323 for (j = 0; j < argc; j++) {
ed9b544e 2324 if (sdslen(argv[j])) {
2325 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2326 c->argc++;
2327 } else {
2328 sdsfree(argv[j]);
2329 }
2330 }
2331 zfree(argv);
7c49733c 2332 if (c->argc) {
2333 /* Execute the command. If the client is still valid
2334 * after processCommand() return and there is something
2335 * on the query buffer try to process the next command. */
2336 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2337 } else {
2338 /* Nothing to process, argc == 0. Just process the query
2339 * buffer if it's not empty or return to the caller */
2340 if (sdslen(c->querybuf)) goto again;
2341 }
ed9b544e 2342 return;
644fafa3 2343 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2344 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2345 freeClient(c);
2346 return;
2347 }
2348 } else {
2349 /* Bulk read handling. Note that if we are at this point
2350 the client already sent a command terminated with a newline,
2351 we are reading the bulk data that is actually the last
2352 argument of the command. */
2353 int qbl = sdslen(c->querybuf);
2354
2355 if (c->bulklen <= qbl) {
2356 /* Copy everything but the final CRLF as final argument */
2357 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2358 c->argc++;
2359 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2360 /* Process the command. If the client is still valid after
2361 * the processing and there is more data in the buffer
2362 * try to parse it. */
2363 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2364 return;
2365 }
2366 }
2367}
2368
638e42ac 2369static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2370 redisClient *c = (redisClient*) privdata;
2371 char buf[REDIS_IOBUF_LEN];
2372 int nread;
2373 REDIS_NOTUSED(el);
2374 REDIS_NOTUSED(mask);
2375
2376 nread = read(fd, buf, REDIS_IOBUF_LEN);
2377 if (nread == -1) {
2378 if (errno == EAGAIN) {
2379 nread = 0;
2380 } else {
f870935d 2381 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2382 freeClient(c);
2383 return;
2384 }
2385 } else if (nread == 0) {
f870935d 2386 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2387 freeClient(c);
2388 return;
2389 }
2390 if (nread) {
2391 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2392 c->lastinteraction = time(NULL);
2393 } else {
2394 return;
2395 }
5921aa36 2396 if (!(c->flags & REDIS_BLOCKED))
2397 processInputBuffer(c);
638e42ac 2398}
2399
ed9b544e 2400static int selectDb(redisClient *c, int id) {
2401 if (id < 0 || id >= server.dbnum)
2402 return REDIS_ERR;
3305306f 2403 c->db = &server.db[id];
ed9b544e 2404 return REDIS_OK;
2405}
2406
40d224a9 2407static void *dupClientReplyValue(void *o) {
2408 incrRefCount((robj*)o);
12d090d2 2409 return o;
40d224a9 2410}
2411
ed9b544e 2412static redisClient *createClient(int fd) {
2413 redisClient *c = zmalloc(sizeof(*c));
2414
2415 anetNonBlock(NULL,fd);
2416 anetTcpNoDelay(NULL,fd);
2417 if (!c) return NULL;
2418 selectDb(c,0);
2419 c->fd = fd;
2420 c->querybuf = sdsempty();
2421 c->argc = 0;
93ea3759 2422 c->argv = NULL;
ed9b544e 2423 c->bulklen = -1;
e8a74421 2424 c->multibulk = 0;
2425 c->mbargc = 0;
2426 c->mbargv = NULL;
ed9b544e 2427 c->sentlen = 0;
2428 c->flags = 0;
2429 c->lastinteraction = time(NULL);
abcb223e 2430 c->authenticated = 0;
40d224a9 2431 c->replstate = REDIS_REPL_NONE;
6b47e12e 2432 c->reply = listCreate();
ed9b544e 2433 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2434 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2435 c->blockingkeys = NULL;
2436 c->blockingkeysnum = 0;
2437 c->io_keys = listCreate();
2438 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2439 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2440 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2441 freeClient(c);
2442 return NULL;
2443 }
6b47e12e 2444 listAddNodeTail(server.clients,c);
6e469882 2445 initClientMultiState(c);
ed9b544e 2446 return c;
2447}
2448
2449static void addReply(redisClient *c, robj *obj) {
2450 if (listLength(c->reply) == 0 &&
6208b3a7 2451 (c->replstate == REDIS_REPL_NONE ||
2452 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2453 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2454 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2455
2456 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2457 obj = dupStringObject(obj);
2458 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2459 }
9d65a1bb 2460 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2461}
2462
2463static void addReplySds(redisClient *c, sds s) {
2464 robj *o = createObject(REDIS_STRING,s);
2465 addReply(c,o);
2466 decrRefCount(o);
2467}
2468
e2665397 2469static void addReplyDouble(redisClient *c, double d) {
2470 char buf[128];
2471
2472 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2473 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2474 (unsigned long) strlen(buf),buf));
e2665397 2475}
2476
f44dd428 2477static void addReplyLong(redisClient *c, long l) {
2478 char buf[128];
2479 size_t len;
2480
2481 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2482 addReplySds(c,sdsnewlen(buf,len));
2483}
2484
92b27fe9 2485static void addReplyUlong(redisClient *c, unsigned long ul) {
2486 char buf[128];
2487 size_t len;
2488
2489 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2490 addReplySds(c,sdsnewlen(buf,len));
2491}
2492
942a3961 2493static void addReplyBulkLen(redisClient *c, robj *obj) {
2494 size_t len;
2495
2496 if (obj->encoding == REDIS_ENCODING_RAW) {
2497 len = sdslen(obj->ptr);
2498 } else {
2499 long n = (long)obj->ptr;
2500
e054afda 2501 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2502 len = 1;
2503 if (n < 0) {
2504 len++;
2505 n = -n;
2506 }
2507 while((n = n/10) != 0) {
2508 len++;
2509 }
2510 }
83c6a618 2511 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2512}
2513
ed9b544e 2514static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2515 int cport, cfd;
2516 char cip[128];
285add55 2517 redisClient *c;
ed9b544e 2518 REDIS_NOTUSED(el);
2519 REDIS_NOTUSED(mask);
2520 REDIS_NOTUSED(privdata);
2521
2522 cfd = anetAccept(server.neterr, fd, cip, &cport);
2523 if (cfd == AE_ERR) {
f870935d 2524 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2525 return;
2526 }
f870935d 2527 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2528 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2529 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2530 close(cfd); /* May be already closed, just ingore errors */
2531 return;
2532 }
285add55 2533 /* If maxclient directive is set and this is one client more... close the
2534 * connection. Note that we create the client instead to check before
2535 * for this condition, since now the socket is already set in nonblocking
2536 * mode and we can send an error for free using the Kernel I/O */
2537 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2538 char *err = "-ERR max number of clients reached\r\n";
2539
2540 /* That's a best effort error message, don't check write errors */
fee803ba 2541 if (write(c->fd,err,strlen(err)) == -1) {
2542 /* Nothing to do, Just to avoid the warning... */
2543 }
285add55 2544 freeClient(c);
2545 return;
2546 }
ed9b544e 2547 server.stat_numconnections++;
2548}
2549
2550/* ======================= Redis objects implementation ===================== */
2551
2552static robj *createObject(int type, void *ptr) {
2553 robj *o;
2554
a5819310 2555 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2556 if (listLength(server.objfreelist)) {
2557 listNode *head = listFirst(server.objfreelist);
2558 o = listNodeValue(head);
2559 listDelNode(server.objfreelist,head);
a5819310 2560 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2561 } else {
75680a3c 2562 if (server.vm_enabled) {
a5819310 2563 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2564 o = zmalloc(sizeof(*o));
2565 } else {
2566 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2567 }
ed9b544e 2568 }
ed9b544e 2569 o->type = type;
942a3961 2570 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2571 o->ptr = ptr;
2572 o->refcount = 1;
3a66edc7 2573 if (server.vm_enabled) {
1064ef87 2574 /* Note that this code may run in the context of an I/O thread
2575 * and accessing to server.unixtime in theory is an error
2576 * (no locks). But in practice this is safe, and even if we read
2577 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2578 o->vm.atime = server.unixtime;
2579 o->storage = REDIS_VM_MEMORY;
2580 }
ed9b544e 2581 return o;
2582}
2583
2584static robj *createStringObject(char *ptr, size_t len) {
2585 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2586}
2587
4ef8de8a 2588static robj *dupStringObject(robj *o) {
b9bc0eef 2589 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2590 return createStringObject(o->ptr,sdslen(o->ptr));
2591}
2592
ed9b544e 2593static robj *createListObject(void) {
2594 list *l = listCreate();
2595
ed9b544e 2596 listSetFreeMethod(l,decrRefCount);
2597 return createObject(REDIS_LIST,l);
2598}
2599
2600static robj *createSetObject(void) {
2601 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2602 return createObject(REDIS_SET,d);
2603}
2604
5234952b 2605static robj *createHashObject(void) {
2606 /* All the Hashes start as zipmaps. Will be automatically converted
2607 * into hash tables if there are enough elements or big elements
2608 * inside. */
2609 unsigned char *zm = zipmapNew();
2610 robj *o = createObject(REDIS_HASH,zm);
2611 o->encoding = REDIS_ENCODING_ZIPMAP;
2612 return o;
2613}
2614
1812e024 2615static robj *createZsetObject(void) {
6b47e12e 2616 zset *zs = zmalloc(sizeof(*zs));
2617
2618 zs->dict = dictCreate(&zsetDictType,NULL);
2619 zs->zsl = zslCreate();
2620 return createObject(REDIS_ZSET,zs);
1812e024 2621}
2622
ed9b544e 2623static void freeStringObject(robj *o) {
942a3961 2624 if (o->encoding == REDIS_ENCODING_RAW) {
2625 sdsfree(o->ptr);
2626 }
ed9b544e 2627}
2628
2629static void freeListObject(robj *o) {
2630 listRelease((list*) o->ptr);
2631}
2632
2633static void freeSetObject(robj *o) {
2634 dictRelease((dict*) o->ptr);
2635}
2636
fd8ccf44 2637static void freeZsetObject(robj *o) {
2638 zset *zs = o->ptr;
2639
2640 dictRelease(zs->dict);
2641 zslFree(zs->zsl);
2642 zfree(zs);
2643}
2644
ed9b544e 2645static void freeHashObject(robj *o) {
cbba7dd7 2646 switch (o->encoding) {
2647 case REDIS_ENCODING_HT:
2648 dictRelease((dict*) o->ptr);
2649 break;
2650 case REDIS_ENCODING_ZIPMAP:
2651 zfree(o->ptr);
2652 break;
2653 default:
2654 redisAssert(0);
2655 break;
2656 }
ed9b544e 2657}
2658
2659static void incrRefCount(robj *o) {
f2b8ab34 2660 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2661 o->refcount++;
2662}
2663
2664static void decrRefCount(void *obj) {
2665 robj *o = obj;
94754ccc 2666
970e10bb 2667 /* Object is a key of a swapped out value, or in the process of being
2668 * loaded. */
996cb5f7 2669 if (server.vm_enabled &&
2670 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2671 {
2672 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2673 redisAssert(o->refcount == 1);
2674 }
2675 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2676 redisAssert(o->type == REDIS_STRING);
a35ddf12 2677 freeStringObject(o);
2678 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2679 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2680 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2681 !listAddNodeHead(server.objfreelist,o))
2682 zfree(o);
a5819310 2683 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2684 server.vm_stats_swapped_objects--;
a35ddf12 2685 return;
2686 }
996cb5f7 2687 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2688 if (--(o->refcount) == 0) {
996cb5f7 2689 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2690 vmCancelThreadedIOJob(obj);
ed9b544e 2691 switch(o->type) {
2692 case REDIS_STRING: freeStringObject(o); break;
2693 case REDIS_LIST: freeListObject(o); break;
2694 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2695 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2696 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2697 default: redisAssert(0 != 0); break;
ed9b544e 2698 }
a5819310 2699 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2700 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2701 !listAddNodeHead(server.objfreelist,o))
2702 zfree(o);
a5819310 2703 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2704 }
2705}
2706
942a3961 2707static robj *lookupKey(redisDb *db, robj *key) {
2708 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2709 if (de) {
55cf8433 2710 robj *key = dictGetEntryKey(de);
2711 robj *val = dictGetEntryVal(de);
3a66edc7 2712
55cf8433 2713 if (server.vm_enabled) {
996cb5f7 2714 if (key->storage == REDIS_VM_MEMORY ||
2715 key->storage == REDIS_VM_SWAPPING)
2716 {
2717 /* If we were swapping the object out, stop it, this key
2718 * was requested. */
2719 if (key->storage == REDIS_VM_SWAPPING)
2720 vmCancelThreadedIOJob(key);
55cf8433 2721 /* Update the access time of the key for the aging algorithm. */
2722 key->vm.atime = server.unixtime;
2723 } else {
d5d55fc3 2724 int notify = (key->storage == REDIS_VM_LOADING);
2725
55cf8433 2726 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2727 redisAssert(val == NULL);
55cf8433 2728 val = vmLoadObject(key);
2729 dictGetEntryVal(de) = val;
d5d55fc3 2730
2731 /* Clients blocked by the VM subsystem may be waiting for
2732 * this key... */
2733 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2734 }
2735 }
2736 return val;
3a66edc7 2737 } else {
2738 return NULL;
2739 }
942a3961 2740}
2741
2742static robj *lookupKeyRead(redisDb *db, robj *key) {
2743 expireIfNeeded(db,key);
2744 return lookupKey(db,key);
2745}
2746
2747static robj *lookupKeyWrite(redisDb *db, robj *key) {
2748 deleteIfVolatile(db,key);
2749 return lookupKey(db,key);
2750}
2751
92b27fe9 2752static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2753 robj *o = lookupKeyRead(c->db, key);
2754 if (!o) addReply(c,reply);
2755 return o;
2756}
2757
2758static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2759 robj *o = lookupKeyWrite(c->db, key);
2760 if (!o) addReply(c,reply);
2761 return o;
2762}
2763
2764static int checkType(redisClient *c, robj *o, int type) {
2765 if (o->type != type) {
2766 addReply(c,shared.wrongtypeerr);
2767 return 1;
2768 }
2769 return 0;
2770}
2771
942a3961 2772static int deleteKey(redisDb *db, robj *key) {
2773 int retval;
2774
2775 /* We need to protect key from destruction: after the first dictDelete()
2776 * it may happen that 'key' is no longer valid if we don't increment
2777 * it's count. This may happen when we get the object reference directly
2778 * from the hash table with dictRandomKey() or dict iterators */
2779 incrRefCount(key);
2780 if (dictSize(db->expires)) dictDelete(db->expires,key);
2781 retval = dictDelete(db->dict,key);
2782 decrRefCount(key);
2783
2784 return retval == DICT_OK;
2785}
2786
10c43610 2787/* Try to share an object against the shared objects pool */
2788static robj *tryObjectSharing(robj *o) {
2789 struct dictEntry *de;
2790 unsigned long c;
2791
3305306f 2792 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2793
dfc5e96c 2794 redisAssert(o->type == REDIS_STRING);
10c43610 2795 de = dictFind(server.sharingpool,o);
2796 if (de) {
2797 robj *shared = dictGetEntryKey(de);
2798
2799 c = ((unsigned long) dictGetEntryVal(de))+1;
2800 dictGetEntryVal(de) = (void*) c;
2801 incrRefCount(shared);
2802 decrRefCount(o);
2803 return shared;
2804 } else {
2805 /* Here we are using a stream algorihtm: Every time an object is
2806 * shared we increment its count, everytime there is a miss we
2807 * recrement the counter of a random object. If this object reaches
2808 * zero we remove the object and put the current object instead. */
3305306f 2809 if (dictSize(server.sharingpool) >=
10c43610 2810 server.sharingpoolsize) {
2811 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2812 redisAssert(de != NULL);
10c43610 2813 c = ((unsigned long) dictGetEntryVal(de))-1;
2814 dictGetEntryVal(de) = (void*) c;
2815 if (c == 0) {
2816 dictDelete(server.sharingpool,de->key);
2817 }
2818 } else {
2819 c = 0; /* If the pool is empty we want to add this object */
2820 }
2821 if (c == 0) {
2822 int retval;
2823
2824 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2825 redisAssert(retval == DICT_OK);
10c43610 2826 incrRefCount(o);
2827 }
2828 return o;
2829 }
2830}
2831
724a51b1 2832/* Check if the nul-terminated string 's' can be represented by a long
2833 * (that is, is a number that fits into long without any other space or
2834 * character before or after the digits).
2835 *
2836 * If so, the function returns REDIS_OK and *longval is set to the value
2837 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2838static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2839 char buf[32], *endptr;
2840 long value;
2841 int slen;
2842
2843 value = strtol(s, &endptr, 10);
2844 if (endptr[0] != '\0') return REDIS_ERR;
2845 slen = snprintf(buf,32,"%ld",value);
2846
2847 /* If the number converted back into a string is not identical
2848 * then it's not possible to encode the string as integer */
f69f2cba 2849 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2850 if (longval) *longval = value;
2851 return REDIS_OK;
2852}
2853
942a3961 2854/* Try to encode a string object in order to save space */
2855static int tryObjectEncoding(robj *o) {
2856 long value;
942a3961 2857 sds s = o->ptr;
3305306f 2858
942a3961 2859 if (o->encoding != REDIS_ENCODING_RAW)
2860 return REDIS_ERR; /* Already encoded */
3305306f 2861
942a3961 2862 /* It's not save to encode shared objects: shared objects can be shared
2863 * everywhere in the "object space" of Redis. Encoded objects can only
2864 * appear as "values" (and not, for instance, as keys) */
2865 if (o->refcount > 1) return REDIS_ERR;
3305306f 2866
942a3961 2867 /* Currently we try to encode only strings */
dfc5e96c 2868 redisAssert(o->type == REDIS_STRING);
94754ccc 2869
724a51b1 2870 /* Check if we can represent this string as a long integer */
2871 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2872
2873 /* Ok, this object can be encoded */
2874 o->encoding = REDIS_ENCODING_INT;
2875 sdsfree(o->ptr);
2876 o->ptr = (void*) value;
2877 return REDIS_OK;
2878}
2879
9d65a1bb 2880/* Get a decoded version of an encoded object (returned as a new object).
2881 * If the object is already raw-encoded just increment the ref count. */
2882static robj *getDecodedObject(robj *o) {
942a3961 2883 robj *dec;
2884
9d65a1bb 2885 if (o->encoding == REDIS_ENCODING_RAW) {
2886 incrRefCount(o);
2887 return o;
2888 }
942a3961 2889 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2890 char buf[32];
2891
2892 snprintf(buf,32,"%ld",(long)o->ptr);
2893 dec = createStringObject(buf,strlen(buf));
2894 return dec;
2895 } else {
dfc5e96c 2896 redisAssert(1 != 1);
942a3961 2897 }
3305306f 2898}
2899
d7f43c08 2900/* Compare two string objects via strcmp() or alike.
2901 * Note that the objects may be integer-encoded. In such a case we
2902 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2903 * and compare the strings, it's much faster than calling getDecodedObject().
2904 *
2905 * Important note: if objects are not integer encoded, but binary-safe strings,
2906 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2907 * binary safe. */
724a51b1 2908static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2909 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2910 char bufa[128], bufb[128], *astr, *bstr;
2911 int bothsds = 1;
724a51b1 2912
e197b441 2913 if (a == b) return 0;
d7f43c08 2914 if (a->encoding != REDIS_ENCODING_RAW) {
2915 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2916 astr = bufa;
2917 bothsds = 0;
724a51b1 2918 } else {
d7f43c08 2919 astr = a->ptr;
724a51b1 2920 }
d7f43c08 2921 if (b->encoding != REDIS_ENCODING_RAW) {
2922 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2923 bstr = bufb;
2924 bothsds = 0;
2925 } else {
2926 bstr = b->ptr;
2927 }
2928 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2929}
2930
0ea663ea 2931static size_t stringObjectLen(robj *o) {
dfc5e96c 2932 redisAssert(o->type == REDIS_STRING);
0ea663ea 2933 if (o->encoding == REDIS_ENCODING_RAW) {
2934 return sdslen(o->ptr);
2935 } else {
2936 char buf[32];
2937
2938 return snprintf(buf,32,"%ld",(long)o->ptr);
2939 }
2940}
2941
06233c45 2942/*============================ RDB saving/loading =========================== */
ed9b544e 2943
/* Write the one-byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
2948
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
2954
e3566d4b 2955/* check rdbLoadLen() comments for more info */
f78fd11b 2956static int rdbSaveLen(FILE *fp, uint32_t len) {
2957 unsigned char buf[2];
2958
2959 if (len < (1<<6)) {
2960 /* Save a 6 bit len */
10c43610 2961 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2962 if (fwrite(buf,1,1,fp) == 0) return -1;
2963 } else if (len < (1<<14)) {
2964 /* Save a 14 bit len */
10c43610 2965 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2966 buf[1] = len&0xFF;
17be1a4a 2967 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2968 } else {
2969 /* Save a 32 bit len */
10c43610 2970 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2971 if (fwrite(buf,1,1,fp) == 0) return -1;
2972 len = htonl(len);
2973 if (fwrite(&len,4,1,fp) == 0) return -1;
2974 }
2975 return 0;
2976}
2977
e3566d4b 2978/* String objects in the form "2391" "-100" without any space and with a
2979 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2980 * encoded as integers to save space */
b1befe6a 2981static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 2982 long long value;
2983 char *endptr, buf[32];
2984
2985 /* Check if it's possible to encode this value as a number */
2986 value = strtoll(s, &endptr, 10);
2987 if (endptr[0] != '\0') return 0;
2988 snprintf(buf,32,"%lld",value);
2989
2990 /* If the number converted back into a string is not identical
2991 * then it's not possible to encode the string as integer */
b1befe6a 2992 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 2993
2994 /* Finally check if it fits in our ranges */
2995 if (value >= -(1<<7) && value <= (1<<7)-1) {
2996 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2997 enc[1] = value&0xFF;
2998 return 2;
2999 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3000 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3001 enc[1] = value&0xFF;
3002 enc[2] = (value>>8)&0xFF;
3003 return 3;
3004 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3005 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3006 enc[1] = value&0xFF;
3007 enc[2] = (value>>8)&0xFF;
3008 enc[3] = (value>>16)&0xFF;
3009 enc[4] = (value>>24)&0xFF;
3010 return 5;
3011 } else {
3012 return 0;
3013 }
3014}
3015
b1befe6a 3016static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3017 size_t comprlen, outlen;
774e3047 3018 unsigned char byte;
3019 void *out;
3020
3021 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3022 if (len <= 4) return 0;
3023 outlen = len-4;
3a2694c4 3024 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3025 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3026 if (comprlen == 0) {
88e85998 3027 zfree(out);
774e3047 3028 return 0;
3029 }
3030 /* Data compressed! Let's save it on disk */
3031 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3032 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3033 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3034 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3035 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3036 zfree(out);
774e3047 3037 return comprlen;
3038
3039writeerr:
88e85998 3040 zfree(out);
774e3047 3041 return -1;
3042}
3043
e3566d4b 3044/* Save a string objet as [len][data] on disk. If the object is a string
3045 * representation of an integer value we try to safe it in a special form */
b1befe6a 3046static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3047 int enclen;
10c43610 3048
774e3047 3049 /* Try integer encoding */
e3566d4b 3050 if (len <= 11) {
3051 unsigned char buf[5];
b1befe6a 3052 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3053 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3054 return 0;
3055 }
3056 }
774e3047 3057
3058 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3059 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3060 if (server.rdbcompression && len > 20) {
774e3047 3061 int retval;
3062
b1befe6a 3063 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3064 if (retval == -1) return -1;
3065 if (retval > 0) return 0;
3066 /* retval == 0 means data can't be compressed, save the old way */
3067 }
3068
3069 /* Store verbatim */
10c43610 3070 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3071 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3072 return 0;
3073}
3074
942a3961 3075/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3076static int rdbSaveStringObject(FILE *fp, robj *obj) {
3077 int retval;
942a3961 3078
f2d9f50f 3079 /* Avoid incr/decr ref count business when possible.
3080 * This plays well with copy-on-write given that we are probably
3081 * in a child process (BGSAVE). Also this makes sure key objects
3082 * of swapped objects are not incRefCount-ed (an assert does not allow
3083 * this in order to avoid bugs) */
3084 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3085 obj = getDecodedObject(obj);
b1befe6a 3086 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3087 decrRefCount(obj);
3088 } else {
b1befe6a 3089 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3090 }
9d65a1bb 3091 return retval;
942a3961 3092}
3093
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* The special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        /* Representation from buf[1] on, its length in buf[0]. */
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3120
06233c45 3121/* Save a Redis object. */
3122static int rdbSaveObject(FILE *fp, robj *o) {
3123 if (o->type == REDIS_STRING) {
3124 /* Save a string value */
3125 if (rdbSaveStringObject(fp,o) == -1) return -1;
3126 } else if (o->type == REDIS_LIST) {
3127 /* Save a list value */
3128 list *list = o->ptr;
c7df85a4 3129 listIter li;
06233c45 3130 listNode *ln;
3131
06233c45 3132 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3133 listRewind(list,&li);
3134 while((ln = listNext(&li))) {
06233c45 3135 robj *eleobj = listNodeValue(ln);
3136
3137 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3138 }
3139 } else if (o->type == REDIS_SET) {
3140 /* Save a set value */
3141 dict *set = o->ptr;
3142 dictIterator *di = dictGetIterator(set);
3143 dictEntry *de;
3144
3145 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3146 while((de = dictNext(di)) != NULL) {
3147 robj *eleobj = dictGetEntryKey(de);
3148
3149 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3150 }
3151 dictReleaseIterator(di);
3152 } else if (o->type == REDIS_ZSET) {
3153 /* Save a set value */
3154 zset *zs = o->ptr;
3155 dictIterator *di = dictGetIterator(zs->dict);
3156 dictEntry *de;
3157
3158 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3159 while((de = dictNext(di)) != NULL) {
3160 robj *eleobj = dictGetEntryKey(de);
3161 double *score = dictGetEntryVal(de);
3162
3163 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3164 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3165 }
3166 dictReleaseIterator(di);
b1befe6a 3167 } else if (o->type == REDIS_HASH) {
3168 /* Save a hash value */
3169 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3170 unsigned char *p = zipmapRewind(o->ptr);
3171 unsigned int count = zipmapLen(o->ptr);
3172 unsigned char *key, *val;
3173 unsigned int klen, vlen;
3174
3175 if (rdbSaveLen(fp,count) == -1) return -1;
3176 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3177 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3178 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3179 }
3180 } else {
3181 dictIterator *di = dictGetIterator(o->ptr);
3182 dictEntry *de;
3183
3184 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3185 while((de = dictNext(di)) != NULL) {
3186 robj *key = dictGetEntryKey(de);
3187 robj *val = dictGetEntryVal(de);
3188
3189 if (rdbSaveStringObject(fp,key) == -1) return -1;
3190 if (rdbSaveStringObject(fp,val) == -1) return -1;
3191 }
3192 dictReleaseIterator(di);
3193 }
06233c45 3194 } else {
3195 redisAssert(0 != 0);
3196 }
3197 return 0;
3198}
3199
3200/* Return the length the object will have on disk if saved with
3201 * the rdbSaveObject() function. Currently we use a trick to get
3202 * this length with very little changes to the code. In the future
3203 * we could switch to a faster solution. */
b9bc0eef 3204static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3205 if (fp == NULL) fp = server.devnull;
06233c45 3206 rewind(fp);
3207 assert(rdbSaveObject(fp,o) != 1);
3208 return ftello(fp);
3209}
3210
06224fec 3211/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3212static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3213 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3214
3215 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3216}
3217
ed9b544e 3218/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3219static int rdbSave(char *filename) {
ed9b544e 3220 dictIterator *di = NULL;
3221 dictEntry *de;
ed9b544e 3222 FILE *fp;
3223 char tmpfile[256];
3224 int j;
bb32ede5 3225 time_t now = time(NULL);
ed9b544e 3226
2316bb3b 3227 /* Wait for I/O therads to terminate, just in case this is a
3228 * foreground-saving, to avoid seeking the swap file descriptor at the
3229 * same time. */
3230 if (server.vm_enabled)
3231 waitEmptyIOJobsQueue();
3232
a3b21203 3233 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3234 fp = fopen(tmpfile,"w");
3235 if (!fp) {
3236 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3237 return REDIS_ERR;
3238 }
f78fd11b 3239 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3240 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3241 redisDb *db = server.db+j;
3242 dict *d = db->dict;
3305306f 3243 if (dictSize(d) == 0) continue;
ed9b544e 3244 di = dictGetIterator(d);
3245 if (!di) {
3246 fclose(fp);
3247 return REDIS_ERR;
3248 }
3249
3250 /* Write the SELECT DB opcode */
f78fd11b 3251 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3252 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3253
3254 /* Iterate this DB writing every entry */
3255 while((de = dictNext(di)) != NULL) {
3256 robj *key = dictGetEntryKey(de);
3257 robj *o = dictGetEntryVal(de);
bb32ede5 3258 time_t expiretime = getExpire(db,key);
3259
3260 /* Save the expire time */
3261 if (expiretime != -1) {
3262 /* If this key is already expired skip it */
3263 if (expiretime < now) continue;
3264 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3265 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3266 }
7e69548d 3267 /* Save the key and associated value. This requires special
3268 * handling if the value is swapped out. */
996cb5f7 3269 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3270 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3271 /* Save type, key, value */
3272 if (rdbSaveType(fp,o->type) == -1) goto werr;
3273 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3274 if (rdbSaveObject(fp,o) == -1) goto werr;
3275 } else {
996cb5f7 3276 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3277 robj *po;
7e69548d 3278 /* Get a preview of the object in memory */
3279 po = vmPreviewObject(key);
7e69548d 3280 /* Save type, key, value */
3281 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3282 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3283 if (rdbSaveObject(fp,po) == -1) goto werr;
3284 /* Remove the loaded object from memory */
3285 decrRefCount(po);
7e69548d 3286 }
ed9b544e 3287 }
3288 dictReleaseIterator(di);
3289 }
3290 /* EOF opcode */
f78fd11b 3291 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3292
3293 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3294 fflush(fp);
3295 fsync(fileno(fp));
3296 fclose(fp);
3297
3298 /* Use RENAME to make sure the DB file is changed atomically only
3299 * if the generate DB file is ok. */
3300 if (rename(tmpfile,filename) == -1) {
325d1eb4 3301 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3302 unlink(tmpfile);
3303 return REDIS_ERR;
3304 }
3305 redisLog(REDIS_NOTICE,"DB saved on disk");
3306 server.dirty = 0;
3307 server.lastsave = time(NULL);
3308 return REDIS_OK;
3309
3310werr:
3311 fclose(fp);
3312 unlink(tmpfile);
3313 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3314 if (di) dictReleaseIterator(di);
3315 return REDIS_ERR;
3316}
3317
f78fd11b 3318static int rdbSaveBackground(char *filename) {
ed9b544e 3319 pid_t childpid;
3320
9d65a1bb 3321 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3322 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3323 if ((childpid = fork()) == 0) {
3324 /* Child */
054e426d 3325 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3326 close(server.fd);
f78fd11b 3327 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3328 _exit(0);
ed9b544e 3329 } else {
478c2c6f 3330 _exit(1);
ed9b544e 3331 }
3332 } else {
3333 /* Parent */
5a7c647e 3334 if (childpid == -1) {
3335 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3336 strerror(errno));
3337 return REDIS_ERR;
3338 }
ed9b544e 3339 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3340 server.bgsavechildpid = childpid;
ed9b544e 3341 return REDIS_OK;
3342 }
3343 return REDIS_OK; /* unreached */
3344}
3345
/* Unlink the file named "temp-<childpid>.rdb" in the current working
 * directory. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3352
/* Read a single byte from 'fp' and return it as a non-negative int.
 * Returns -1 if the byte cannot be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 1) ? byte : -1;
}
3358
/* Read a 4 byte signed integer from 'fp' (as laid out in memory, no byte
 * swapping is performed) and return it as a time_t.
 * Returns -1 if the four bytes cannot be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t)raw;
}
3364
e3566d4b 3365/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3366 * of this file for a description of how this are stored on disk.
3367 *
3368 * isencoded is set to 1 if the readed length is not actually a length but
3369 * an "encoding type", check the above comments for more info */
c78a8ccc 3370static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3371 unsigned char buf[2];
3372 uint32_t len;
c78a8ccc 3373 int type;
f78fd11b 3374
e3566d4b 3375 if (isencoded) *isencoded = 0;
c78a8ccc 3376 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3377 type = (buf[0]&0xC0)>>6;
3378 if (type == REDIS_RDB_6BITLEN) {
3379 /* Read a 6 bit len */
3380 return buf[0]&0x3F;
3381 } else if (type == REDIS_RDB_ENCVAL) {
3382 /* Read a 6 bit len encoding type */
3383 if (isencoded) *isencoded = 1;
3384 return buf[0]&0x3F;
3385 } else if (type == REDIS_RDB_14BITLEN) {
3386 /* Read a 14 bit len */
3387 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3388 return ((buf[0]&0x3F)<<8)|buf[1];
3389 } else {
3390 /* Read a 32 bit len */
f78fd11b 3391 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3392 return ntohl(len);
f78fd11b 3393 }
f78fd11b 3394}
3395
e3566d4b 3396static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3397 unsigned char enc[4];
3398 long long val;
3399
3400 if (enctype == REDIS_RDB_ENC_INT8) {
3401 if (fread(enc,1,1,fp) == 0) return NULL;
3402 val = (signed char)enc[0];
3403 } else if (enctype == REDIS_RDB_ENC_INT16) {
3404 uint16_t v;
3405 if (fread(enc,2,1,fp) == 0) return NULL;
3406 v = enc[0]|(enc[1]<<8);
3407 val = (int16_t)v;
3408 } else if (enctype == REDIS_RDB_ENC_INT32) {
3409 uint32_t v;
3410 if (fread(enc,4,1,fp) == 0) return NULL;
3411 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3412 val = (int32_t)v;
3413 } else {
3414 val = 0; /* anti-warning */
dfc5e96c 3415 redisAssert(0!=0);
e3566d4b 3416 }
3417 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3418}
3419
c78a8ccc 3420static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3421 unsigned int len, clen;
3422 unsigned char *c = NULL;
3423 sds val = NULL;
3424
c78a8ccc 3425 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3426 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3427 if ((c = zmalloc(clen)) == NULL) goto err;
3428 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3429 if (fread(c,clen,1,fp) == 0) goto err;
3430 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3431 zfree(c);
88e85998 3432 return createObject(REDIS_STRING,val);
3433err:
3434 zfree(c);
3435 sdsfree(val);
3436 return NULL;
3437}
3438
c78a8ccc 3439static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3440 int isencoded;
3441 uint32_t len;
f78fd11b 3442 sds val;
3443
c78a8ccc 3444 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3445 if (isencoded) {
3446 switch(len) {
3447 case REDIS_RDB_ENC_INT8:
3448 case REDIS_RDB_ENC_INT16:
3449 case REDIS_RDB_ENC_INT32:
3305306f 3450 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3451 case REDIS_RDB_ENC_LZF:
c78a8ccc 3452 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3453 default:
dfc5e96c 3454 redisAssert(0!=0);
e3566d4b 3455 }
3456 }
3457
f78fd11b 3458 if (len == REDIS_RDB_LENERR) return NULL;
3459 val = sdsnewlen(NULL,len);
3460 if (len && fread(val,len,1,fp) == 0) {
3461 sdsfree(val);
3462 return NULL;
3463 }
10c43610 3464 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3465}
3466
a7866db6 3467/* For information about double serialization check rdbSaveDoubleValue() */
3468static int rdbLoadDoubleValue(FILE *fp, double *val) {
3469 char buf[128];
3470 unsigned char len;
3471
3472 if (fread(&len,1,1,fp) == 0) return -1;
3473 switch(len) {
3474 case 255: *val = R_NegInf; return 0;
3475 case 254: *val = R_PosInf; return 0;
3476 case 253: *val = R_Nan; return 0;
3477 default:
3478 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3479 buf[len] = '\0';
a7866db6 3480 sscanf(buf, "%lg", val);
3481 return 0;
3482 }
3483}
3484
c78a8ccc 3485/* Load a Redis object of the specified type from the specified file.
3486 * On success a newly allocated object is returned, otherwise NULL. */
3487static robj *rdbLoadObject(int type, FILE *fp) {
3488 robj *o;
3489
bcd11906 3490 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3491 if (type == REDIS_STRING) {
3492 /* Read string value */
3493 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3494 tryObjectEncoding(o);
3495 } else if (type == REDIS_LIST || type == REDIS_SET) {
3496 /* Read list/set value */
3497 uint32_t listlen;
3498
3499 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3500 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3501 /* It's faster to expand the dict to the right size asap in order
3502 * to avoid rehashing */
3503 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3504 dictExpand(o->ptr,listlen);
c78a8ccc 3505 /* Load every single element of the list/set */
3506 while(listlen--) {
3507 robj *ele;
3508
3509 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3510 tryObjectEncoding(ele);
3511 if (type == REDIS_LIST) {
3512 listAddNodeTail((list*)o->ptr,ele);
3513 } else {
3514 dictAdd((dict*)o->ptr,ele,NULL);
3515 }
3516 }
3517 } else if (type == REDIS_ZSET) {
3518 /* Read list/set value */
ada386b2 3519 size_t zsetlen;
c78a8ccc 3520 zset *zs;
3521
3522 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3523 o = createZsetObject();
3524 zs = o->ptr;
3525 /* Load every single element of the list/set */
3526 while(zsetlen--) {
3527 robj *ele;
3528 double *score = zmalloc(sizeof(double));
3529
3530 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3531 tryObjectEncoding(ele);
3532 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3533 dictAdd(zs->dict,ele,score);
3534 zslInsert(zs->zsl,*score,ele);
3535 incrRefCount(ele); /* added to skiplist */
3536 }
ada386b2 3537 } else if (type == REDIS_HASH) {
3538 size_t hashlen;
3539
3540 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3541 o = createHashObject();
3542 /* Too many entries? Use an hash table. */
3543 if (hashlen > server.hash_max_zipmap_entries)
3544 convertToRealHash(o);
3545 /* Load every key/value, then set it into the zipmap or hash
3546 * table, as needed. */
3547 while(hashlen--) {
3548 robj *key, *val;
3549
3550 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3551 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3552 /* If we are using a zipmap and there are too big values
3553 * the object is converted to real hash table encoding. */
3554 if (o->encoding != REDIS_ENCODING_HT &&
3555 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3556 sdslen(val->ptr) > server.hash_max_zipmap_value))
3557 {
3558 convertToRealHash(o);
3559 }
3560
3561 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3562 unsigned char *zm = o->ptr;
3563
3564 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3565 val->ptr,sdslen(val->ptr),NULL);
3566 o->ptr = zm;
3567 decrRefCount(key);
3568 decrRefCount(val);
3569 } else {
3570 tryObjectEncoding(key);
3571 tryObjectEncoding(val);
3572 dictAdd((dict*)o->ptr,key,val);
3573 incrRefCount(key);
3574 incrRefCount(val);
3575 }
3576 }
c78a8ccc 3577 } else {
3578 redisAssert(0 != 0);
3579 }
3580 return o;
3581}
3582
f78fd11b 3583static int rdbLoad(char *filename) {
ed9b544e 3584 FILE *fp;
f78fd11b 3585 robj *keyobj = NULL;
3586 uint32_t dbid;
bb32ede5 3587 int type, retval, rdbver;
3305306f 3588 dict *d = server.db[0].dict;
bb32ede5 3589 redisDb *db = server.db+0;
f78fd11b 3590 char buf[1024];
bb32ede5 3591 time_t expiretime = -1, now = time(NULL);
b492cf00 3592 long long loadedkeys = 0;
bb32ede5 3593
ed9b544e 3594 fp = fopen(filename,"r");
3595 if (!fp) return REDIS_ERR;
3596 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3597 buf[9] = '\0';
3598 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3599 fclose(fp);
3600 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3601 return REDIS_ERR;
3602 }
f78fd11b 3603 rdbver = atoi(buf+5);
c78a8ccc 3604 if (rdbver != 1) {
f78fd11b 3605 fclose(fp);
3606 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3607 return REDIS_ERR;
3608 }
ed9b544e 3609 while(1) {
3610 robj *o;
3611
3612 /* Read type. */
f78fd11b 3613 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3614 if (type == REDIS_EXPIRETIME) {
3615 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3616 /* We read the time so we need to read the object type again */
3617 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3618 }
ed9b544e 3619 if (type == REDIS_EOF) break;
3620 /* Handle SELECT DB opcode as a special case */
3621 if (type == REDIS_SELECTDB) {
c78a8ccc 3622 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3623 goto eoferr;
ed9b544e 3624 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3625 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3626 exit(1);
3627 }
bb32ede5 3628 db = server.db+dbid;
3629 d = db->dict;
ed9b544e 3630 continue;
3631 }
3632 /* Read key */
c78a8ccc 3633 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3634 /* Read value */
3635 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3636 /* Add the new object in the hash table */
f78fd11b 3637 retval = dictAdd(d,keyobj,o);
ed9b544e 3638 if (retval == DICT_ERR) {
f78fd11b 3639 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3640 exit(1);
3641 }
bb32ede5 3642 /* Set the expire time if needed */
3643 if (expiretime != -1) {
3644 setExpire(db,keyobj,expiretime);
3645 /* Delete this key if already expired */
3646 if (expiretime < now) deleteKey(db,keyobj);
3647 expiretime = -1;
3648 }
f78fd11b 3649 keyobj = o = NULL;
b492cf00 3650 /* Handle swapping while loading big datasets when VM is on */
3651 loadedkeys++;
3652 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3653 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3654 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3655 }
3656 }
ed9b544e 3657 }
3658 fclose(fp);
3659 return REDIS_OK;
3660
3661eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3662 if (keyobj) decrRefCount(keyobj);
f80dff62 3663 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3664 exit(1);
3665 return REDIS_ERR; /* Just to avoid warning */
3666}
3667
3668/*================================== Commands =============================== */
3669
abcb223e 3670static void authCommand(redisClient *c) {
2e77c2ee 3671 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3672 c->authenticated = 1;
3673 addReply(c,shared.ok);
3674 } else {
3675 c->authenticated = 0;
fa4c0aba 3676 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3677 }
3678}
3679
ed9b544e 3680static void pingCommand(redisClient *c) {
3681 addReply(c,shared.pong);
3682}
3683
3684static void echoCommand(redisClient *c) {
942a3961 3685 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3686 addReply(c,c->argv[1]);
3687 addReply(c,shared.crlf);
3688}
3689
3690/*=================================== Strings =============================== */
3691
3692static void setGenericCommand(redisClient *c, int nx) {
3693 int retval;
3694
333fd216 3695 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3696 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3697 if (retval == DICT_ERR) {
3698 if (!nx) {
1b03836c 3699 /* If the key is about a swapped value, we want a new key object
3700 * to overwrite the old. So we delete the old key in the database.
3701 * This will also make sure that swap pages about the old object
3702 * will be marked as free. */
ddfaca9d 3703 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3704 incrRefCount(c->argv[1]);
3305306f 3705 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3706 incrRefCount(c->argv[2]);
3707 } else {
c937aa89 3708 addReply(c,shared.czero);
ed9b544e 3709 return;
3710 }
3711 } else {
3712 incrRefCount(c->argv[1]);
3713 incrRefCount(c->argv[2]);
3714 }
3715 server.dirty++;
3305306f 3716 removeExpire(c->db,c->argv[1]);
c937aa89 3717 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3718}
3719
3720static void setCommand(redisClient *c) {
a4d1ba9a 3721 setGenericCommand(c,0);
ed9b544e 3722}
3723
3724static void setnxCommand(redisClient *c) {
a4d1ba9a 3725 setGenericCommand(c,1);
ed9b544e 3726}
3727
322fc7d8 3728static int getGenericCommand(redisClient *c) {
3305306f 3729 robj *o = lookupKeyRead(c->db,c->argv[1]);
3730
3731 if (o == NULL) {
c937aa89 3732 addReply(c,shared.nullbulk);
322fc7d8 3733 return REDIS_OK;
ed9b544e 3734 } else {
ed9b544e 3735 if (o->type != REDIS_STRING) {
c937aa89 3736 addReply(c,shared.wrongtypeerr);
322fc7d8 3737 return REDIS_ERR;
ed9b544e 3738 } else {
942a3961 3739 addReplyBulkLen(c,o);
ed9b544e 3740 addReply(c,o);
3741 addReply(c,shared.crlf);
322fc7d8 3742 return REDIS_OK;
ed9b544e 3743 }
3744 }
3745}
3746
322fc7d8 3747static void getCommand(redisClient *c) {
3748 getGenericCommand(c);
3749}
3750
f6b141c5 3751static void getsetCommand(redisClient *c) {
322fc7d8 3752 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3753 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3754 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3755 } else {
3756 incrRefCount(c->argv[1]);
3757 }
3758 incrRefCount(c->argv[2]);
3759 server.dirty++;
3760 removeExpire(c->db,c->argv[1]);
3761}
3762
70003d28 3763static void mgetCommand(redisClient *c) {
70003d28 3764 int j;
3765
c937aa89 3766 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3767 for (j = 1; j < c->argc; j++) {
3305306f 3768 robj *o = lookupKeyRead(c->db,c->argv[j]);
3769 if (o == NULL) {
c937aa89 3770 addReply(c,shared.nullbulk);
70003d28 3771 } else {
70003d28 3772 if (o->type != REDIS_STRING) {
c937aa89 3773 addReply(c,shared.nullbulk);
70003d28 3774 } else {
942a3961 3775 addReplyBulkLen(c,o);
70003d28 3776 addReply(c,o);
3777 addReply(c,shared.crlf);
3778 }
3779 }
3780 }
3781}
3782
6c446631 3783static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3784 int j, busykeys = 0;
6c446631 3785
3786 if ((c->argc % 2) == 0) {
454d4e43 3787 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3788 return;
3789 }
3790 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3791 * set nothing at all if at least one already key exists. */
3792 if (nx) {
3793 for (j = 1; j < c->argc; j += 2) {
906573e7 3794 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3795 busykeys++;
6c446631 3796 }
3797 }
3798 }
906573e7 3799 if (busykeys) {
3800 addReply(c, shared.czero);
3801 return;
3802 }
6c446631 3803
3804 for (j = 1; j < c->argc; j += 2) {
3805 int retval;
3806
17511391 3807 tryObjectEncoding(c->argv[j+1]);
6c446631 3808 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3809 if (retval == DICT_ERR) {
3810 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3811 incrRefCount(c->argv[j+1]);
3812 } else {
3813 incrRefCount(c->argv[j]);
3814 incrRefCount(c->argv[j+1]);
3815 }
3816 removeExpire(c->db,c->argv[j]);
3817 }
3818 server.dirty += (c->argc-1)/2;
3819 addReply(c, nx ? shared.cone : shared.ok);
3820}
3821
3822static void msetCommand(redisClient *c) {
3823 msetGenericCommand(c,0);
3824}
3825
3826static void msetnxCommand(redisClient *c) {
3827 msetGenericCommand(c,1);
3828}
3829
d68ed120 3830static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3831 long long value;
3832 int retval;
3833 robj *o;
3834
3305306f 3835 o = lookupKeyWrite(c->db,c->argv[1]);
3836 if (o == NULL) {
ed9b544e 3837 value = 0;
3838 } else {
ed9b544e 3839 if (o->type != REDIS_STRING) {
3840 value = 0;
3841 } else {
3842 char *eptr;
3843
942a3961 3844 if (o->encoding == REDIS_ENCODING_RAW)
3845 value = strtoll(o->ptr, &eptr, 10);
3846 else if (o->encoding == REDIS_ENCODING_INT)
3847 value = (long)o->ptr;
3848 else
dfc5e96c 3849 redisAssert(1 != 1);
ed9b544e 3850 }
3851 }
3852
3853 value += incr;
3854 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3855 tryObjectEncoding(o);
3305306f 3856 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3857 if (retval == DICT_ERR) {
3305306f 3858 dictReplace(c->db->dict,c->argv[1],o);
3859 removeExpire(c->db,c->argv[1]);
ed9b544e 3860 } else {
3861 incrRefCount(c->argv[1]);
3862 }
3863 server.dirty++;
c937aa89 3864 addReply(c,shared.colon);
ed9b544e 3865 addReply(c,o);
3866 addReply(c,shared.crlf);
3867}
3868
3869static void incrCommand(redisClient *c) {
a4d1ba9a 3870 incrDecrCommand(c,1);
ed9b544e 3871}
3872
3873static void decrCommand(redisClient *c) {
a4d1ba9a 3874 incrDecrCommand(c,-1);
ed9b544e 3875}
3876
3877static void incrbyCommand(redisClient *c) {
d68ed120 3878 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3879 incrDecrCommand(c,incr);
ed9b544e 3880}
3881
3882static void decrbyCommand(redisClient *c) {
d68ed120 3883 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3884 incrDecrCommand(c,-incr);
ed9b544e 3885}
3886
4b00bebd 3887static void appendCommand(redisClient *c) {
3888 int retval;
3889 size_t totlen;
3890 robj *o;
3891
3892 o = lookupKeyWrite(c->db,c->argv[1]);
3893 if (o == NULL) {
3894 /* Create the key */
3895 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3896 incrRefCount(c->argv[1]);
3897 incrRefCount(c->argv[2]);
3898 totlen = stringObjectLen(c->argv[2]);
3899 } else {
3900 dictEntry *de;
3901
3902 de = dictFind(c->db->dict,c->argv[1]);
3903 assert(de != NULL);
3904
3905 o = dictGetEntryVal(de);
3906 if (o->type != REDIS_STRING) {
3907 addReply(c,shared.wrongtypeerr);
3908 return;
3909 }
3910 /* If the object is specially encoded or shared we have to make
3911 * a copy */
3912 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3913 robj *decoded = getDecodedObject(o);
3914
3915 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3916 decrRefCount(decoded);
3917 dictReplace(c->db->dict,c->argv[1],o);
3918 }
3919 /* APPEND! */
3920 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3921 o->ptr = sdscatlen(o->ptr,
3922 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3923 } else {
3924 o->ptr = sdscatprintf(o->ptr, "%ld",
3925 (unsigned long) c->argv[2]->ptr);
3926 }
3927 totlen = sdslen(o->ptr);
3928 }
3929 server.dirty++;
3930 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3931}
3932
39191553 3933static void substrCommand(redisClient *c) {
3934 robj *o;
3935 long start = atoi(c->argv[2]->ptr);
3936 long end = atoi(c->argv[3]->ptr);
3937
3938 o = lookupKeyRead(c->db,c->argv[1]);
3939 if (o == NULL) {
3940 addReply(c,shared.nullbulk);
3941 } else {
3942 if (o->type != REDIS_STRING) {
3943 addReply(c,shared.wrongtypeerr);
3944 } else {
8fe7fad7 3945 size_t rangelen, strlen;
39191553 3946 sds range;
3947
8fe7fad7 3948 o = getDecodedObject(o);
3949 strlen = sdslen(o->ptr);
3950
39191553 3951 /* convert negative indexes */
3952 if (start < 0) start = strlen+start;
3953 if (end < 0) end = strlen+end;
3954 if (start < 0) start = 0;
3955 if (end < 0) end = 0;
3956
3957 /* indexes sanity checks */
3958 if (start > end || (size_t)start >= strlen) {
3959 /* Out of range start or start > end result in null reply */
3960 addReply(c,shared.nullbulk);
8fe7fad7 3961 decrRefCount(o);
39191553 3962 return;
3963 }
3964 if ((size_t)end >= strlen) end = strlen-1;
3965 rangelen = (end-start)+1;
3966
3967 /* Return the result */
5de9ad7c 3968 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
39191553 3969 range = sdsnewlen((char*)o->ptr+start,rangelen);
3970 addReplySds(c,range);
3971 addReply(c,shared.crlf);
8fe7fad7 3972 decrRefCount(o);
39191553 3973 }
3974 }
3975}
3976
ed9b544e 3977/* ========================= Type agnostic commands ========================= */
3978
3979static void delCommand(redisClient *c) {
5109cdff 3980 int deleted = 0, j;
3981
3982 for (j = 1; j < c->argc; j++) {
3983 if (deleteKey(c->db,c->argv[j])) {
3984 server.dirty++;
3985 deleted++;
3986 }
3987 }
3988 switch(deleted) {
3989 case 0:
c937aa89 3990 addReply(c,shared.czero);
5109cdff 3991 break;
3992 case 1:
3993 addReply(c,shared.cone);
3994 break;
3995 default:
3996 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3997 break;
ed9b544e 3998 }
3999}
4000
4001static void existsCommand(redisClient *c) {
3305306f 4002 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4003}
4004
4005static void selectCommand(redisClient *c) {
4006 int id = atoi(c->argv[1]->ptr);
4007
4008 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4009 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4010 } else {
4011 addReply(c,shared.ok);
4012 }
4013}
4014
4015static void randomkeyCommand(redisClient *c) {
4016 dictEntry *de;
3305306f 4017
4018 while(1) {
4019 de = dictGetRandomKey(c->db->dict);
ce7bef07 4020 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4021 }
ed9b544e 4022 if (de == NULL) {
ce7bef07 4023 addReply(c,shared.plus);
ed9b544e 4024 addReply(c,shared.crlf);
4025 } else {
c937aa89 4026 addReply(c,shared.plus);
ed9b544e 4027 addReply(c,dictGetEntryKey(de));
4028 addReply(c,shared.crlf);
4029 }
4030}
4031
4032static void keysCommand(redisClient *c) {
4033 dictIterator *di;
4034 dictEntry *de;
4035 sds pattern = c->argv[1]->ptr;
4036 int plen = sdslen(pattern);
a3f9eec2 4037 unsigned long numkeys = 0;
ed9b544e 4038 robj *lenobj = createObject(REDIS_STRING,NULL);
4039
3305306f 4040 di = dictGetIterator(c->db->dict);
ed9b544e 4041 addReply(c,lenobj);
4042 decrRefCount(lenobj);
4043 while((de = dictNext(di)) != NULL) {
4044 robj *keyobj = dictGetEntryKey(de);
3305306f 4045
ed9b544e 4046 sds key = keyobj->ptr;
4047 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4048 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4049 if (expireIfNeeded(c->db,keyobj) == 0) {
a3f9eec2 4050 addReplyBulkLen(c,keyobj);
3305306f 4051 addReply(c,keyobj);
a3f9eec2 4052 addReply(c,shared.crlf);
3305306f 4053 numkeys++;
3305306f 4054 }
ed9b544e 4055 }
4056 }
4057 dictReleaseIterator(di);
a3f9eec2 4058 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4059}
4060
4061static void dbsizeCommand(redisClient *c) {
4062 addReplySds(c,
3305306f 4063 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4064}
4065
4066static void lastsaveCommand(redisClient *c) {
4067 addReplySds(c,
c937aa89 4068 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4069}
4070
4071static void typeCommand(redisClient *c) {
3305306f 4072 robj *o;
ed9b544e 4073 char *type;
3305306f 4074
4075 o = lookupKeyRead(c->db,c->argv[1]);
4076 if (o == NULL) {
c937aa89 4077 type = "+none";
ed9b544e 4078 } else {
ed9b544e 4079 switch(o->type) {
c937aa89 4080 case REDIS_STRING: type = "+string"; break;
4081 case REDIS_LIST: type = "+list"; break;
4082 case REDIS_SET: type = "+set"; break;
412a8bce 4083 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4084 case REDIS_HASH: type = "+hash"; break;
4085 default: type = "+unknown"; break;
ed9b544e 4086 }
4087 }
4088 addReplySds(c,sdsnew(type));
4089 addReply(c,shared.crlf);
4090}
4091
4092static void saveCommand(redisClient *c) {
9d65a1bb 4093 if (server.bgsavechildpid != -1) {
05557f6d 4094 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4095 return;
4096 }
f78fd11b 4097 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4098 addReply(c,shared.ok);
4099 } else {
4100 addReply(c,shared.err);
4101 }
4102}
4103
4104static void bgsaveCommand(redisClient *c) {
9d65a1bb 4105 if (server.bgsavechildpid != -1) {
ed9b544e 4106 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4107 return;
4108 }
f78fd11b 4109 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4110 char *status = "+Background saving started\r\n";
4111 addReplySds(c,sdsnew(status));
ed9b544e 4112 } else {
4113 addReply(c,shared.err);
4114 }
4115}
4116
4117static void shutdownCommand(redisClient *c) {
4118 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4119 /* Kill the saving child if there is a background saving in progress.
4120 We want to avoid race conditions, for instance our saving child may
4121 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4122 if (server.bgsavechildpid != -1) {
9f3c422c 4123 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4124 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4125 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4126 }
ac945e2d 4127 if (server.appendonly) {
4128 /* Append only file: fsync() the AOF and exit */
4129 fsync(server.appendfd);
054e426d 4130 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4131 exit(0);
ed9b544e 4132 } else {
ac945e2d 4133 /* Snapshotting. Perform a SYNC SAVE and exit */
4134 if (rdbSave(server.dbfilename) == REDIS_OK) {
4135 if (server.daemonize)
4136 unlink(server.pidfile);
4137 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4138 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4139 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4140 exit(0);
4141 } else {
4142 /* Ooops.. error saving! The best we can do is to continue operating.
4143 * Note that if there was a background saving process, in the next
4144 * cron() Redis will be notified that the background saving aborted,
4145 * handling special stuff like slaves pending for synchronization... */
4146 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4147 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4148 }
ed9b544e 4149 }
4150}
4151
4152static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4153 robj *o;
4154
4155 /* To use the same key as src and dst is probably an error */
4156 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4157 addReply(c,shared.sameobjecterr);
ed9b544e 4158 return;
4159 }
4160
3305306f 4161 o = lookupKeyWrite(c->db,c->argv[1]);
4162 if (o == NULL) {
c937aa89 4163 addReply(c,shared.nokeyerr);
ed9b544e 4164 return;
4165 }
ed9b544e 4166 incrRefCount(o);
3305306f 4167 deleteIfVolatile(c->db,c->argv[2]);
4168 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4169 if (nx) {
4170 decrRefCount(o);
c937aa89 4171 addReply(c,shared.czero);
ed9b544e 4172 return;
4173 }
3305306f 4174 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4175 } else {
4176 incrRefCount(c->argv[2]);
4177 }
3305306f 4178 deleteKey(c->db,c->argv[1]);
ed9b544e 4179 server.dirty++;
c937aa89 4180 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4181}
4182
4183static void renameCommand(redisClient *c) {
4184 renameGenericCommand(c,0);
4185}
4186
4187static void renamenxCommand(redisClient *c) {
4188 renameGenericCommand(c,1);
4189}
4190
4191static void moveCommand(redisClient *c) {
3305306f 4192 robj *o;
4193 redisDb *src, *dst;
ed9b544e 4194 int srcid;
4195
4196 /* Obtain source and target DB pointers */
3305306f 4197 src = c->db;
4198 srcid = c->db->id;
ed9b544e 4199 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4200 addReply(c,shared.outofrangeerr);
ed9b544e 4201 return;
4202 }
3305306f 4203 dst = c->db;
4204 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4205
4206 /* If the user is moving using as target the same
4207 * DB as the source DB it is probably an error. */
4208 if (src == dst) {
c937aa89 4209 addReply(c,shared.sameobjecterr);
ed9b544e 4210 return;
4211 }
4212
4213 /* Check if the element exists and get a reference */
3305306f 4214 o = lookupKeyWrite(c->db,c->argv[1]);
4215 if (!o) {
c937aa89 4216 addReply(c,shared.czero);
ed9b544e 4217 return;
4218 }
4219
4220 /* Try to add the element to the target DB */
3305306f 4221 deleteIfVolatile(dst,c->argv[1]);
4222 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4223 addReply(c,shared.czero);
ed9b544e 4224 return;
4225 }
3305306f 4226 incrRefCount(c->argv[1]);
ed9b544e 4227 incrRefCount(o);
4228
4229 /* OK! key moved, free the entry in the source DB */
3305306f 4230 deleteKey(src,c->argv[1]);
ed9b544e 4231 server.dirty++;
c937aa89 4232 addReply(c,shared.cone);
ed9b544e 4233}
4234
4235/* =================================== Lists ================================ */
4236static void pushGenericCommand(redisClient *c, int where) {
4237 robj *lobj;
ed9b544e 4238 list *list;
3305306f 4239
4240 lobj = lookupKeyWrite(c->db,c->argv[1]);
4241 if (lobj == NULL) {
95242ab5 4242 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4243 addReply(c,shared.cone);
95242ab5 4244 return;
4245 }
ed9b544e 4246 lobj = createListObject();
4247 list = lobj->ptr;
4248 if (where == REDIS_HEAD) {
6b47e12e 4249 listAddNodeHead(list,c->argv[2]);
ed9b544e 4250 } else {
6b47e12e 4251 listAddNodeTail(list,c->argv[2]);
ed9b544e 4252 }
3305306f 4253 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4254 incrRefCount(c->argv[1]);
4255 incrRefCount(c->argv[2]);
4256 } else {
ed9b544e 4257 if (lobj->type != REDIS_LIST) {
4258 addReply(c,shared.wrongtypeerr);
4259 return;
4260 }
95242ab5 4261 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4262 addReply(c,shared.cone);
95242ab5 4263 return;
4264 }
ed9b544e 4265 list = lobj->ptr;
4266 if (where == REDIS_HEAD) {
6b47e12e 4267 listAddNodeHead(list,c->argv[2]);
ed9b544e 4268 } else {
6b47e12e 4269 listAddNodeTail(list,c->argv[2]);
ed9b544e 4270 }
4271 incrRefCount(c->argv[2]);
4272 }
4273 server.dirty++;
520b5a33 4274 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4275}
4276
4277static void lpushCommand(redisClient *c) {
4278 pushGenericCommand(c,REDIS_HEAD);
4279}
4280
4281static void rpushCommand(redisClient *c) {
4282 pushGenericCommand(c,REDIS_TAIL);
4283}
4284
4285static void llenCommand(redisClient *c) {
3305306f 4286 robj *o;
ed9b544e 4287 list *l;
4288
3305306f 4289 o = lookupKeyRead(c->db,c->argv[1]);
4290 if (o == NULL) {
c937aa89 4291 addReply(c,shared.czero);
ed9b544e 4292 return;
4293 } else {
ed9b544e 4294 if (o->type != REDIS_LIST) {
c937aa89 4295 addReply(c,shared.wrongtypeerr);
ed9b544e 4296 } else {
4297 l = o->ptr;
c937aa89 4298 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 4299 }
4300 }
4301}
4302
4303static void lindexCommand(redisClient *c) {
3305306f 4304 robj *o;
ed9b544e 4305 int index = atoi(c->argv[2]->ptr);
4306
3305306f 4307 o = lookupKeyRead(c->db,c->argv[1]);
4308 if (o == NULL) {
c937aa89 4309 addReply(c,shared.nullbulk);
ed9b544e 4310 } else {
ed9b544e 4311 if (o->type != REDIS_LIST) {
c937aa89 4312 addReply(c,shared.wrongtypeerr);
ed9b544e 4313 } else {
4314 list *list = o->ptr;
4315 listNode *ln;
4316
4317 ln = listIndex(list, index);
4318 if (ln == NULL) {
c937aa89 4319 addReply(c,shared.nullbulk);
ed9b544e 4320 } else {
4321 robj *ele = listNodeValue(ln);
942a3961 4322 addReplyBulkLen(c,ele);
ed9b544e 4323 addReply(c,ele);
4324 addReply(c,shared.crlf);
4325 }
4326 }
4327 }
4328}
4329
4330static void lsetCommand(redisClient *c) {
3305306f 4331 robj *o;
ed9b544e 4332 int index = atoi(c->argv[2]->ptr);
4333
3305306f 4334 o = lookupKeyWrite(c->db,c->argv[1]);
4335 if (o == NULL) {
ed9b544e 4336 addReply(c,shared.nokeyerr);
4337 } else {
ed9b544e 4338 if (o->type != REDIS_LIST) {
4339 addReply(c,shared.wrongtypeerr);
4340 } else {
4341 list *list = o->ptr;
4342 listNode *ln;
4343
4344 ln = listIndex(list, index);
4345 if (ln == NULL) {
c937aa89 4346 addReply(c,shared.outofrangeerr);
ed9b544e 4347 } else {
4348 robj *ele = listNodeValue(ln);
4349
4350 decrRefCount(ele);
4351 listNodeValue(ln) = c->argv[3];
4352 incrRefCount(c->argv[3]);
4353 addReply(c,shared.ok);
4354 server.dirty++;
4355 }
4356 }
4357 }
4358}
4359
4360static void popGenericCommand(redisClient *c, int where) {
3305306f 4361 robj *o;
4362
4363 o = lookupKeyWrite(c->db,c->argv[1]);
4364 if (o == NULL) {
c937aa89 4365 addReply(c,shared.nullbulk);
ed9b544e 4366 } else {
ed9b544e 4367 if (o->type != REDIS_LIST) {
c937aa89 4368 addReply(c,shared.wrongtypeerr);
ed9b544e 4369 } else {
4370 list *list = o->ptr;
4371 listNode *ln;
4372
4373 if (where == REDIS_HEAD)
4374 ln = listFirst(list);
4375 else
4376 ln = listLast(list);
4377
4378 if (ln == NULL) {
c937aa89 4379 addReply(c,shared.nullbulk);
ed9b544e 4380 } else {
4381 robj *ele = listNodeValue(ln);
942a3961 4382 addReplyBulkLen(c,ele);
ed9b544e 4383 addReply(c,ele);
4384 addReply(c,shared.crlf);
4385 listDelNode(list,ln);
4386 server.dirty++;
4387 }
4388 }
4389 }
4390}
4391
4392static void lpopCommand(redisClient *c) {
4393 popGenericCommand(c,REDIS_HEAD);
4394}
4395
4396static void rpopCommand(redisClient *c) {
4397 popGenericCommand(c,REDIS_TAIL);
4398}
4399
4400static void lrangeCommand(redisClient *c) {
3305306f 4401 robj *o;
ed9b544e 4402 int start = atoi(c->argv[2]->ptr);
4403 int end = atoi(c->argv[3]->ptr);
3305306f 4404
4405 o = lookupKeyRead(c->db,c->argv[1]);
4406 if (o == NULL) {
c937aa89 4407 addReply(c,shared.nullmultibulk);
ed9b544e 4408 } else {
ed9b544e 4409 if (o->type != REDIS_LIST) {
c937aa89 4410 addReply(c,shared.wrongtypeerr);
ed9b544e 4411 } else {
4412 list *list = o->ptr;
4413 listNode *ln;
4414 int llen = listLength(list);
4415 int rangelen, j;
4416 robj *ele;
4417
4418 /* convert negative indexes */
4419 if (start < 0) start = llen+start;
4420 if (end < 0) end = llen+end;
4421 if (start < 0) start = 0;
4422 if (end < 0) end = 0;
4423
4424 /* indexes sanity checks */
4425 if (start > end || start >= llen) {
4426 /* Out of range start or start > end result in empty list */
c937aa89 4427 addReply(c,shared.emptymultibulk);
ed9b544e 4428 return;
4429 }
4430 if (end >= llen) end = llen-1;
4431 rangelen = (end-start)+1;
4432
4433 /* Return the result in form of a multi-bulk reply */
4434 ln = listIndex(list, start);
c937aa89 4435 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 4436 for (j = 0; j < rangelen; j++) {
4437 ele = listNodeValue(ln);
942a3961 4438 addReplyBulkLen(c,ele);
ed9b544e 4439 addReply(c,ele);
4440 addReply(c,shared.crlf);
4441 ln = ln->next;
4442 }
4443 }
4444 }
4445}
4446
4447static void ltrimCommand(redisClient *c) {
3305306f 4448 robj *o;
ed9b544e 4449 int start = atoi(c->argv[2]->ptr);
4450 int end = atoi(c->argv[3]->ptr);
4451
3305306f 4452 o = lookupKeyWrite(c->db,c->argv[1]);
4453 if (o == NULL) {
ab9d4cb1 4454 addReply(c,shared.ok);
ed9b544e 4455 } else {
ed9b544e 4456 if (o->type != REDIS_LIST) {
4457 addReply(c,shared.wrongtypeerr);
4458 } else {
4459 list *list = o->ptr;
4460 listNode *ln;
4461 int llen = listLength(list);
4462 int j, ltrim, rtrim;
4463
4464 /* convert negative indexes */
4465 if (start < 0) start = llen+start;
4466 if (end < 0) end = llen+end;
4467 if (start < 0) start = 0;
4468 if (end < 0) end = 0;
4469
4470 /* indexes sanity checks */
4471 if (start > end || start >= llen) {
4472 /* Out of range start or start > end result in empty list */
4473 ltrim = llen;
4474 rtrim = 0;
4475 } else {
4476 if (end >= llen) end = llen-1;
4477 ltrim = start;
4478 rtrim = llen-end-1;
4479 }
4480
4481 /* Remove list elements to perform the trim */
4482 for (j = 0; j < ltrim; j++) {
4483 ln = listFirst(list);
4484 listDelNode(list,ln);
4485 }
4486 for (j = 0; j < rtrim; j++) {
4487 ln = listLast(list);
4488 listDelNode(list,ln);
4489 }
ed9b544e 4490 server.dirty++;
e59229a2 4491 addReply(c,shared.ok);
ed9b544e 4492 }
4493 }
4494}
4495
4496static void lremCommand(redisClient *c) {
3305306f 4497 robj *o;
ed9b544e 4498
3305306f 4499 o = lookupKeyWrite(c->db,c->argv[1]);
4500 if (o == NULL) {
33c08b39 4501 addReply(c,shared.czero);
ed9b544e 4502 } else {
ed9b544e 4503 if (o->type != REDIS_LIST) {
c937aa89 4504 addReply(c,shared.wrongtypeerr);
ed9b544e 4505 } else {
4506 list *list = o->ptr;
4507 listNode *ln, *next;
4508 int toremove = atoi(c->argv[2]->ptr);
4509 int removed = 0;
4510 int fromtail = 0;
4511
4512 if (toremove < 0) {
4513 toremove = -toremove;
4514 fromtail = 1;
4515 }
4516 ln = fromtail ? list->tail : list->head;
4517 while (ln) {
ed9b544e 4518 robj *ele = listNodeValue(ln);
a4d1ba9a 4519
4520 next = fromtail ? ln->prev : ln->next;
724a51b1 4521 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4522 listDelNode(list,ln);
4523 server.dirty++;
4524 removed++;
4525 if (toremove && removed == toremove) break;
4526 }
4527 ln = next;
4528 }
c937aa89 4529 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4530 }
4531 }
4532}
4533
12f9d551 4534/* This is the semantic of this command:
0f5f7e9a 4535 * RPOPLPUSH srclist dstlist:
12f9d551 4536 * IF LLEN(srclist) > 0
4537 * element = RPOP srclist
4538 * LPUSH dstlist element
4539 * RETURN element
4540 * ELSE
4541 * RETURN nil
4542 * END
4543 * END
4544 *
4545 * The idea is to be able to get an element from a list in a reliable way
4546 * since the element is not just returned but pushed against another list
4547 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4548 */
0f5f7e9a 4549static void rpoplpushcommand(redisClient *c) {
12f9d551 4550 robj *sobj;
4551
4552 sobj = lookupKeyWrite(c->db,c->argv[1]);
4553 if (sobj == NULL) {
4554 addReply(c,shared.nullbulk);
4555 } else {
4556 if (sobj->type != REDIS_LIST) {
4557 addReply(c,shared.wrongtypeerr);
4558 } else {
4559 list *srclist = sobj->ptr;
4560 listNode *ln = listLast(srclist);
4561
4562 if (ln == NULL) {
4563 addReply(c,shared.nullbulk);
4564 } else {
4565 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4566 robj *ele = listNodeValue(ln);
4567 list *dstlist;
4568
e20fb74f 4569 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4570 addReply(c,shared.wrongtypeerr);
4571 return;
4572 }
e20fb74f 4573
4574 /* Add the element to the target list (unless it's directly
4575 * passed to some BLPOP-ing client */
4576 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4577 if (dobj == NULL) {
4578 /* Create the list if the key does not exist */
4579 dobj = createListObject();
4580 dictAdd(c->db->dict,c->argv[2],dobj);
4581 incrRefCount(c->argv[2]);
4582 }
4583 dstlist = dobj->ptr;
4584 listAddNodeHead(dstlist,ele);
4585 incrRefCount(ele);
4586 }
12f9d551 4587
4588 /* Send the element to the client as reply as well */
4589 addReplyBulkLen(c,ele);
4590 addReply(c,ele);
4591 addReply(c,shared.crlf);
4592
4593 /* Finally remove the element from the source list */
4594 listDelNode(srclist,ln);
4595 server.dirty++;
4596 }
4597 }
4598 }
4599}
4600
4601
ed9b544e 4602/* ==================================== Sets ================================ */
4603
4604static void saddCommand(redisClient *c) {
ed9b544e 4605 robj *set;
4606
3305306f 4607 set = lookupKeyWrite(c->db,c->argv[1]);
4608 if (set == NULL) {
ed9b544e 4609 set = createSetObject();
3305306f 4610 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4611 incrRefCount(c->argv[1]);
4612 } else {
ed9b544e 4613 if (set->type != REDIS_SET) {
c937aa89 4614 addReply(c,shared.wrongtypeerr);
ed9b544e 4615 return;
4616 }
4617 }
4618 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4619 incrRefCount(c->argv[2]);
4620 server.dirty++;
c937aa89 4621 addReply(c,shared.cone);
ed9b544e 4622 } else {
c937aa89 4623 addReply(c,shared.czero);
ed9b544e 4624 }
4625}
4626
4627static void sremCommand(redisClient *c) {
3305306f 4628 robj *set;
ed9b544e 4629
3305306f 4630 set = lookupKeyWrite(c->db,c->argv[1]);
4631 if (set == NULL) {
c937aa89 4632 addReply(c,shared.czero);
ed9b544e 4633 } else {
ed9b544e 4634 if (set->type != REDIS_SET) {
c937aa89 4635 addReply(c,shared.wrongtypeerr);
ed9b544e 4636 return;
4637 }
4638 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4639 server.dirty++;
12fea928 4640 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4641 addReply(c,shared.cone);
ed9b544e 4642 } else {
c937aa89 4643 addReply(c,shared.czero);
ed9b544e 4644 }
4645 }
4646}
4647
a4460ef4 4648static void smoveCommand(redisClient *c) {
4649 robj *srcset, *dstset;
4650
4651 srcset = lookupKeyWrite(c->db,c->argv[1]);
4652 dstset = lookupKeyWrite(c->db,c->argv[2]);
4653
4654 /* If the source key does not exist return 0, if it's of the wrong type
4655 * raise an error */
4656 if (srcset == NULL || srcset->type != REDIS_SET) {
4657 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4658 return;
4659 }
4660 /* Error if the destination key is not a set as well */
4661 if (dstset && dstset->type != REDIS_SET) {
4662 addReply(c,shared.wrongtypeerr);
4663 return;
4664 }
4665 /* Remove the element from the source set */
4666 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4667 /* Key not found in the src set! return zero */
4668 addReply(c,shared.czero);
4669 return;
4670 }
4671 server.dirty++;
4672 /* Add the element to the destination set */
4673 if (!dstset) {
4674 dstset = createSetObject();
4675 dictAdd(c->db->dict,c->argv[2],dstset);
4676 incrRefCount(c->argv[2]);
4677 }
4678 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4679 incrRefCount(c->argv[3]);
4680 addReply(c,shared.cone);
4681}
4682
ed9b544e 4683static void sismemberCommand(redisClient *c) {
3305306f 4684 robj *set;
ed9b544e 4685
3305306f 4686 set = lookupKeyRead(c->db,c->argv[1]);
4687 if (set == NULL) {
c937aa89 4688 addReply(c,shared.czero);
ed9b544e 4689 } else {
ed9b544e 4690 if (set->type != REDIS_SET) {
c937aa89 4691 addReply(c,shared.wrongtypeerr);
ed9b544e 4692 return;
4693 }
4694 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4695 addReply(c,shared.cone);
ed9b544e 4696 else
c937aa89 4697 addReply(c,shared.czero);
ed9b544e 4698 }
4699}
4700
4701static void scardCommand(redisClient *c) {
3305306f 4702 robj *o;
ed9b544e 4703 dict *s;
4704
3305306f 4705 o = lookupKeyRead(c->db,c->argv[1]);
4706 if (o == NULL) {
c937aa89 4707 addReply(c,shared.czero);
ed9b544e 4708 return;
4709 } else {
ed9b544e 4710 if (o->type != REDIS_SET) {
c937aa89 4711 addReply(c,shared.wrongtypeerr);
ed9b544e 4712 } else {
4713 s = o->ptr;
682ac724 4714 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4715 dictSize(s)));
ed9b544e 4716 }
4717 }
4718}
4719
12fea928 4720static void spopCommand(redisClient *c) {
4721 robj *set;
4722 dictEntry *de;
4723
4724 set = lookupKeyWrite(c->db,c->argv[1]);
4725 if (set == NULL) {
4726 addReply(c,shared.nullbulk);
4727 } else {
4728 if (set->type != REDIS_SET) {
4729 addReply(c,shared.wrongtypeerr);
4730 return;
4731 }
4732 de = dictGetRandomKey(set->ptr);
4733 if (de == NULL) {
4734 addReply(c,shared.nullbulk);
4735 } else {
4736 robj *ele = dictGetEntryKey(de);
4737
942a3961 4738 addReplyBulkLen(c,ele);
12fea928 4739 addReply(c,ele);
4740 addReply(c,shared.crlf);
4741 dictDelete(set->ptr,ele);
4742 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4743 server.dirty++;
4744 }
4745 }
4746}
4747
2abb95a9 4748static void srandmemberCommand(redisClient *c) {
4749 robj *set;
4750 dictEntry *de;
4751
4752 set = lookupKeyRead(c->db,c->argv[1]);
4753 if (set == NULL) {
4754 addReply(c,shared.nullbulk);
4755 } else {
4756 if (set->type != REDIS_SET) {
4757 addReply(c,shared.wrongtypeerr);
4758 return;
4759 }
4760 de = dictGetRandomKey(set->ptr);
4761 if (de == NULL) {
4762 addReply(c,shared.nullbulk);
4763 } else {
4764 robj *ele = dictGetEntryKey(de);
4765
4766 addReplyBulkLen(c,ele);
4767 addReply(c,ele);
4768 addReply(c,shared.crlf);
4769 }
4770 }
4771}
4772
ed9b544e 4773static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4774 dict **d1 = (void*) s1, **d2 = (void*) s2;
4775
3305306f 4776 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4777}
4778
682ac724 4779static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4780 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4781 dictIterator *di;
4782 dictEntry *de;
4783 robj *lenobj = NULL, *dstset = NULL;
682ac724 4784 unsigned long j, cardinality = 0;
ed9b544e 4785
ed9b544e 4786 for (j = 0; j < setsnum; j++) {
4787 robj *setobj;
3305306f 4788
4789 setobj = dstkey ?
4790 lookupKeyWrite(c->db,setskeys[j]) :
4791 lookupKeyRead(c->db,setskeys[j]);
4792 if (!setobj) {
ed9b544e 4793 zfree(dv);
5faa6025 4794 if (dstkey) {
fdcaae84 4795 if (deleteKey(c->db,dstkey))
4796 server.dirty++;
0d36ded0 4797 addReply(c,shared.czero);
5faa6025 4798 } else {
4799 addReply(c,shared.nullmultibulk);
4800 }
ed9b544e 4801 return;
4802 }
ed9b544e 4803 if (setobj->type != REDIS_SET) {
4804 zfree(dv);
c937aa89 4805 addReply(c,shared.wrongtypeerr);
ed9b544e 4806 return;
4807 }
4808 dv[j] = setobj->ptr;
4809 }
4810 /* Sort sets from the smallest to largest, this will improve our
4811 * algorithm's performace */
4812 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4813
4814 /* The first thing we should output is the total number of elements...
4815 * since this is a multi-bulk write, but at this stage we don't know
4816 * the intersection set size, so we use a trick, append an empty object
4817 * to the output list and save the pointer to later modify it with the
4818 * right length */
4819 if (!dstkey) {
4820 lenobj = createObject(REDIS_STRING,NULL);
4821 addReply(c,lenobj);
4822 decrRefCount(lenobj);
4823 } else {
4824 /* If we have a target key where to store the resulting set
4825 * create this key with an empty set inside */
4826 dstset = createSetObject();
ed9b544e 4827 }
4828
4829 /* Iterate all the elements of the first (smallest) set, and test
4830 * the element against all the other sets, if at least one set does
4831 * not include the element it is discarded */
4832 di = dictGetIterator(dv[0]);
ed9b544e 4833
4834 while((de = dictNext(di)) != NULL) {
4835 robj *ele;
4836
4837 for (j = 1; j < setsnum; j++)
4838 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4839 if (j != setsnum)
4840 continue; /* at least one set does not contain the member */
4841 ele = dictGetEntryKey(de);
4842 if (!dstkey) {
942a3961 4843 addReplyBulkLen(c,ele);
ed9b544e 4844 addReply(c,ele);
4845 addReply(c,shared.crlf);
4846 cardinality++;
4847 } else {
4848 dictAdd(dstset->ptr,ele,NULL);
4849 incrRefCount(ele);
4850 }
4851 }
4852 dictReleaseIterator(di);
4853
83cdfe18
AG
4854 if (dstkey) {
4855 /* Store the resulting set into the target */
4856 deleteKey(c->db,dstkey);
4857 dictAdd(c->db->dict,dstkey,dstset);
4858 incrRefCount(dstkey);
4859 }
4860
40d224a9 4861 if (!dstkey) {
682ac724 4862 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4863 } else {
682ac724 4864 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4865 dictSize((dict*)dstset->ptr)));
40d224a9 4866 server.dirty++;
4867 }
ed9b544e 4868 zfree(dv);
4869}
4870
4871static void sinterCommand(redisClient *c) {
4872 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4873}
4874
4875static void sinterstoreCommand(redisClient *c) {
4876 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4877}
4878
/* Set operation selectors. sunionDiffGenericCommand() receives
 * REDIS_OP_UNION or REDIS_OP_DIFF as its 'op' argument. */
f4f56e1d 4879#define REDIS_OP_UNION 0
4880#define REDIS_OP_DIFF 1
2830ca53 4881#define REDIS_OP_INTER 2
f4f56e1d 4882
4883static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4884 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4885 dictIterator *di;
4886 dictEntry *de;
f4f56e1d 4887 robj *dstset = NULL;
40d224a9 4888 int j, cardinality = 0;
4889
40d224a9 4890 for (j = 0; j < setsnum; j++) {
4891 robj *setobj;
4892
4893 setobj = dstkey ?
4894 lookupKeyWrite(c->db,setskeys[j]) :
4895 lookupKeyRead(c->db,setskeys[j]);
4896 if (!setobj) {
4897 dv[j] = NULL;
4898 continue;
4899 }
4900 if (setobj->type != REDIS_SET) {
4901 zfree(dv);
4902 addReply(c,shared.wrongtypeerr);
4903 return;
4904 }
4905 dv[j] = setobj->ptr;
4906 }
4907
4908 /* We need a temp set object to store our union. If the dstkey
4909 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4910 * this set object will be the resulting object to set into the target key*/
4911 dstset = createSetObject();
4912
40d224a9 4913 /* Iterate all the elements of all the sets, add every element a single
4914 * time to the result set */
4915 for (j = 0; j < setsnum; j++) {
51829ed3 4916 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4917 if (!dv[j]) continue; /* non existing keys are like empty sets */
4918
4919 di = dictGetIterator(dv[j]);
40d224a9 4920
4921 while((de = dictNext(di)) != NULL) {
4922 robj *ele;
4923
4924 /* dictAdd will not add the same element multiple times */
4925 ele = dictGetEntryKey(de);
f4f56e1d 4926 if (op == REDIS_OP_UNION || j == 0) {
4927 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4928 incrRefCount(ele);
40d224a9 4929 cardinality++;
4930 }
f4f56e1d 4931 } else if (op == REDIS_OP_DIFF) {
4932 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4933 cardinality--;
4934 }
40d224a9 4935 }
4936 }
4937 dictReleaseIterator(di);
51829ed3
AG
4938
4939 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4940 }
4941
f4f56e1d 4942 /* Output the content of the resulting set, if not in STORE mode */
4943 if (!dstkey) {
4944 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4945 di = dictGetIterator(dstset->ptr);
f4f56e1d 4946 while((de = dictNext(di)) != NULL) {
4947 robj *ele;
4948
4949 ele = dictGetEntryKey(de);
942a3961 4950 addReplyBulkLen(c,ele);
f4f56e1d 4951 addReply(c,ele);
4952 addReply(c,shared.crlf);
4953 }
4954 dictReleaseIterator(di);
83cdfe18
AG
4955 } else {
4956 /* If we have a target key where to store the resulting set
4957 * create this key with the result set inside */
4958 deleteKey(c->db,dstkey);
4959 dictAdd(c->db->dict,dstkey,dstset);
4960 incrRefCount(dstkey);
f4f56e1d 4961 }
4962
4963 /* Cleanup */
40d224a9 4964 if (!dstkey) {
40d224a9 4965 decrRefCount(dstset);
4966 } else {
682ac724 4967 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4968 dictSize((dict*)dstset->ptr)));
40d224a9 4969 server.dirty++;
4970 }
4971 zfree(dv);
4972}
4973
4974static void sunionCommand(redisClient *c) {
f4f56e1d 4975 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4976}
4977
4978static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4979 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4980}
4981
4982static void sdiffCommand(redisClient *c) {
4983 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4984}
4985
4986static void sdiffstoreCommand(redisClient *c) {
4987 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4988}
4989
6b47e12e 4990/* ==================================== ZSets =============================== */
4991
4992/* ZSETs are ordered sets using two data structures to hold the same elements
4993 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4994 * data structure.
4995 *
4996 * The elements are added to a hash table mapping Redis objects to scores.
4997 * At the same time the elements are added to a skip list mapping scores
4998 * to Redis objects (so objects are sorted by scores in this "view"). */
4999
5000/* This skiplist implementation is almost a C translation of the original
5001 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5002 * Alternative to Balanced Trees", modified in three ways:
5003 * a) this implementation allows for repeated values.
5004 * b) the comparison is not just by key (our 'score') but by satellite data.
5005 * c) there is a back pointer, so it's a doubly linked list with the back
5006 * pointers being only at "level 1". This allows to traverse the list
5007 * from tail to head, useful for ZREVRANGE. */
5008
5009static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5010 zskiplistNode *zn = zmalloc(sizeof(*zn));
5011
5012 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5013 if (level > 0)
5014 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5015 zn->score = score;
5016 zn->obj = obj;
5017 return zn;
5018}
5019
5020static zskiplist *zslCreate(void) {
5021 int j;
5022 zskiplist *zsl;
5023
5024 zsl = zmalloc(sizeof(*zsl));
5025 zsl->level = 1;
cc812361 5026 zsl->length = 0;
6b47e12e 5027 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5028 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5029 zsl->header->forward[j] = NULL;
94e543b5 5030
5031 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5032 if (j < ZSKIPLIST_MAXLEVEL-1)
5033 zsl->header->span[j] = 0;
69d95c3e 5034 }
e3870fab 5035 zsl->header->backward = NULL;
5036 zsl->tail = NULL;
6b47e12e 5037 return zsl;
5038}
5039
fd8ccf44 5040static void zslFreeNode(zskiplistNode *node) {
5041 decrRefCount(node->obj);
ad807e6f 5042 zfree(node->forward);
69d95c3e 5043 zfree(node->span);
fd8ccf44 5044 zfree(node);
5045}
5046
5047static void zslFree(zskiplist *zsl) {
ad807e6f 5048 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5049
ad807e6f 5050 zfree(zsl->header->forward);
69d95c3e 5051 zfree(zsl->header->span);
ad807e6f 5052 zfree(zsl->header);
fd8ccf44 5053 while(node) {
599379dd 5054 next = node->forward[0];
fd8ccf44 5055 zslFreeNode(node);
5056 node = next;
5057 }
ad807e6f 5058 zfree(zsl);
fd8ccf44 5059}
5060
6b47e12e 5061static int zslRandomLevel(void) {
5062 int level = 1;
5063 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5064 level += 1;
5065 return level;
5066}
5067
5068static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5069 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5070 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5071 int i, level;
5072
5073 x = zsl->header;
5074 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5075 /* store rank that is crossed to reach the insert position */
5076 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5077
9d60e6e4 5078 while (x->forward[i] &&
5079 (x->forward[i]->score < score ||
5080 (x->forward[i]->score == score &&
69d95c3e 5081 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5082 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5083 x = x->forward[i];
69d95c3e 5084 }
6b47e12e 5085 update[i] = x;
5086 }
6b47e12e 5087 /* we assume the key is not already inside, since we allow duplicated
5088 * scores, and the re-insertion of score and redis object should never
5089 * happpen since the caller of zslInsert() should test in the hash table
5090 * if the element is already inside or not. */
5091 level = zslRandomLevel();
5092 if (level > zsl->level) {
69d95c3e 5093 for (i = zsl->level; i < level; i++) {
2b37892e 5094 rank[i] = 0;
6b47e12e 5095 update[i] = zsl->header;
2b37892e 5096 update[i]->span[i-1] = zsl->length;
69d95c3e 5097 }
6b47e12e 5098 zsl->level = level;
5099 }
5100 x = zslCreateNode(level,score,obj);
5101 for (i = 0; i < level; i++) {
5102 x->forward[i] = update[i]->forward[i];
5103 update[i]->forward[i] = x;
69d95c3e
PN
5104
5105 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5106 if (i > 0) {
5107 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5108 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5109 }
6b47e12e 5110 }
69d95c3e
PN
5111
5112 /* increment span for untouched levels */
5113 for (i = level; i < zsl->level; i++) {
2b37892e 5114 update[i]->span[i-1]++;
69d95c3e
PN
5115 }
5116
bb975144 5117 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5118 if (x->forward[0])
5119 x->forward[0]->backward = x;
5120 else
5121 zsl->tail = x;
cc812361 5122 zsl->length++;
6b47e12e 5123}
5124
84105336
PN
5125/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5126void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5127 int i;
5128 for (i = 0; i < zsl->level; i++) {
5129 if (update[i]->forward[i] == x) {
5130 if (i > 0) {
5131 update[i]->span[i-1] += x->span[i-1] - 1;
5132 }
5133 update[i]->forward[i] = x->forward[i];
5134 } else {
5135 /* invariant: i > 0, because update[0]->forward[0]
5136 * is always equal to x */
5137 update[i]->span[i-1] -= 1;
5138 }
5139 }
5140 if (x->forward[0]) {
5141 x->forward[0]->backward = x->backward;
5142 } else {
5143 zsl->tail = x->backward;
5144 }
5145 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5146 zsl->level--;
5147 zsl->length--;
5148}
5149
50c55df5 5150/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5151static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5152 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5153 int i;
5154
5155 x = zsl->header;
5156 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5157 while (x->forward[i] &&
5158 (x->forward[i]->score < score ||
5159 (x->forward[i]->score == score &&
5160 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5161 x = x->forward[i];
5162 update[i] = x;
5163 }
5164 /* We may have multiple elements with the same score, what we need
5165 * is to find the element with both the right score and object. */
5166 x = x->forward[0];
50c55df5 5167 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5168 zslDeleteNode(zsl, x, update);
9d60e6e4 5169 zslFreeNode(x);
9d60e6e4 5170 return 1;
5171 } else {
5172 return 0; /* not found */
e197b441 5173 }
5174 return 0; /* not found */
fd8ccf44 5175}
5176
1807985b 5177/* Delete all the elements with score between min and max from the skiplist.
5178 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5179 * Note that this function takes the reference to the hash table view of the
5180 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5181static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5182 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5183 unsigned long removed = 0;
5184 int i;
5185
5186 x = zsl->header;
5187 for (i = zsl->level-1; i >= 0; i--) {
5188 while (x->forward[i] && x->forward[i]->score < min)
5189 x = x->forward[i];
5190 update[i] = x;
5191 }
5192 /* We may have multiple elements with the same score, what we need
5193 * is to find the element with both the right score and object. */
5194 x = x->forward[0];
5195 while (x && x->score <= max) {
84105336
PN
5196 zskiplistNode *next = x->forward[0];
5197 zslDeleteNode(zsl, x, update);
1807985b 5198 dictDelete(dict,x->obj);
5199 zslFreeNode(x);
1807985b 5200 removed++;
5201 x = next;
5202 }
5203 return removed; /* not found */
5204}
1807985b 5205
9212eafd 5206/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5207 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5208static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5209 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5210 unsigned long traversed = 0, removed = 0;
5211 int i;
5212
9212eafd
PN
5213 x = zsl->header;
5214 for (i = zsl->level-1; i >= 0; i--) {
5215 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5216 traversed += i > 0 ? x->span[i-1] : 1;
5217 x = x->forward[i];
1807985b 5218 }
9212eafd
PN
5219 update[i] = x;
5220 }
5221
5222 traversed++;
5223 x = x->forward[0];
5224 while (x && traversed <= end) {
84105336
PN
5225 zskiplistNode *next = x->forward[0];
5226 zslDeleteNode(zsl, x, update);
1807985b 5227 dictDelete(dict,x->obj);
5228 zslFreeNode(x);
1807985b 5229 removed++;
9212eafd 5230 traversed++;
1807985b 5231 x = next;
5232 }
9212eafd 5233 return removed;
1807985b 5234}
5235
50c55df5 5236/* Find the first node having a score equal or greater than the specified one.
5237 * Returns NULL if there is no match. */
5238static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5239 zskiplistNode *x;
5240 int i;
5241
5242 x = zsl->header;
5243 for (i = zsl->level-1; i >= 0; i--) {
5244 while (x->forward[i] && x->forward[i]->score < score)
5245 x = x->forward[i];
5246 }
5247 /* We may have multiple elements with the same score, what we need
5248 * is to find the element with both the right score and object. */
5249 return x->forward[0];
5250}
5251
27b0ccca
PN
5252/* Find the rank for an element by both score and key.
5253 * Returns 0 when the element cannot be found, rank otherwise.
5254 * Note that the rank is 1-based due to the span of zsl->header to the
5255 * first element. */
5256static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5257 zskiplistNode *x;
5258 unsigned long rank = 0;
5259 int i;
5260
5261 x = zsl->header;
5262 for (i = zsl->level-1; i >= 0; i--) {
5263 while (x->forward[i] &&
5264 (x->forward[i]->score < score ||
5265 (x->forward[i]->score == score &&
5266 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5267 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5268 x = x->forward[i];
5269 }
5270
5271 /* x might be equal to zsl->header, so test if obj is non-NULL */
5272 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5273 return rank;
5274 }
5275 }
5276 return 0;
5277}
5278
e74825c2
PN
5279/* Finds an element by its rank. The rank argument needs to be 1-based. */
5280zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5281 zskiplistNode *x;
5282 unsigned long traversed = 0;
5283 int i;
5284
5285 x = zsl->header;
5286 for (i = zsl->level-1; i >= 0; i--) {
a50ea45c
PN
5287 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5288 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5289 x = x->forward[i];
5290 }
5291
5292 if (traversed == rank) {
5293 return x;
5294 }
5295 }
5296 return NULL;
5297}
5298
fd8ccf44 5299/* The actual Z-commands implementations */
5300
7db723ad 5301/* This generic command implements both ZADD and ZINCRBY.
e2665397 5302 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5303 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5304static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5305 robj *zsetobj;
5306 zset *zs;
5307 double *score;
5308
e2665397 5309 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5310 if (zsetobj == NULL) {
5311 zsetobj = createZsetObject();
e2665397 5312 dictAdd(c->db->dict,key,zsetobj);
5313 incrRefCount(key);
fd8ccf44 5314 } else {
5315 if (zsetobj->type != REDIS_ZSET) {
5316 addReply(c,shared.wrongtypeerr);
5317 return;
5318 }
5319 }
fd8ccf44 5320 zs = zsetobj->ptr;
e2665397 5321
7db723ad 5322 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5323 * needs to handle the two different conditions. It's all about setting
5324 * '*score', that is, the new score to set, to the right value. */
5325 score = zmalloc(sizeof(double));
5326 if (doincrement) {
5327 dictEntry *de;
5328
5329 /* Read the old score. If the element was not present starts from 0 */
5330 de = dictFind(zs->dict,ele);
5331 if (de) {
5332 double *oldscore = dictGetEntryVal(de);
5333 *score = *oldscore + scoreval;
5334 } else {
5335 *score = scoreval;
5336 }
5337 } else {
5338 *score = scoreval;
5339 }
5340
5341 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5342 * to both ZADD and ZINCRBY... */
e2665397 5343 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5344 /* case 1: New element */
e2665397 5345 incrRefCount(ele); /* added to hash */
5346 zslInsert(zs->zsl,*score,ele);
5347 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5348 server.dirty++;
e2665397 5349 if (doincrement)
e2665397 5350 addReplyDouble(c,*score);
91d71bfc 5351 else
5352 addReply(c,shared.cone);
fd8ccf44 5353 } else {
5354 dictEntry *de;
5355 double *oldscore;
5356
5357 /* case 2: Score update operation */
e2665397 5358 de = dictFind(zs->dict,ele);
dfc5e96c 5359 redisAssert(de != NULL);
fd8ccf44 5360 oldscore = dictGetEntryVal(de);
5361 if (*score != *oldscore) {
5362 int deleted;
5363
e2665397 5364 /* Remove and insert the element in the skip list with new score */
5365 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5366 redisAssert(deleted != 0);
e2665397 5367 zslInsert(zs->zsl,*score,ele);
5368 incrRefCount(ele);
5369 /* Update the score in the hash table */
5370 dictReplace(zs->dict,ele,score);
fd8ccf44 5371 server.dirty++;
2161a965 5372 } else {
5373 zfree(score);
fd8ccf44 5374 }
e2665397 5375 if (doincrement)
5376 addReplyDouble(c,*score);
5377 else
5378 addReply(c,shared.czero);
fd8ccf44 5379 }
5380}
5381
e2665397 5382static void zaddCommand(redisClient *c) {
5383 double scoreval;
5384
5385 scoreval = strtod(c->argv[2]->ptr,NULL);
5386 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5387}
5388
7db723ad 5389static void zincrbyCommand(redisClient *c) {
e2665397 5390 double scoreval;
5391
5392 scoreval = strtod(c->argv[2]->ptr,NULL);
5393 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5394}
5395
1b7106e7 5396static void zremCommand(redisClient *c) {
5397 robj *zsetobj;
5398 zset *zs;
5399
5400 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5401 if (zsetobj == NULL) {
5402 addReply(c,shared.czero);
5403 } else {
5404 dictEntry *de;
5405 double *oldscore;
5406 int deleted;
5407
5408 if (zsetobj->type != REDIS_ZSET) {
5409 addReply(c,shared.wrongtypeerr);
5410 return;
5411 }
5412 zs = zsetobj->ptr;
5413 de = dictFind(zs->dict,c->argv[2]);
5414 if (de == NULL) {
5415 addReply(c,shared.czero);
5416 return;
5417 }
5418 /* Delete from the skiplist */
5419 oldscore = dictGetEntryVal(de);
5420 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 5421 redisAssert(deleted != 0);
1b7106e7 5422
5423 /* Delete from the hash table */
5424 dictDelete(zs->dict,c->argv[2]);
5425 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5426 server.dirty++;
5427 addReply(c,shared.cone);
5428 }
5429}
5430
1807985b 5431static void zremrangebyscoreCommand(redisClient *c) {
5432 double min = strtod(c->argv[2]->ptr,NULL);
5433 double max = strtod(c->argv[3]->ptr,NULL);
5434 robj *zsetobj;
5435 zset *zs;
5436
5437 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5438 if (zsetobj == NULL) {
5439 addReply(c,shared.czero);
5440 } else {
5441 long deleted;
5442
5443 if (zsetobj->type != REDIS_ZSET) {
5444 addReply(c,shared.wrongtypeerr);
5445 return;
5446 }
5447 zs = zsetobj->ptr;
f84d3933 5448 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
1807985b 5449 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5450 server.dirty += deleted;
5451 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5452 }
5453}
5454
9212eafd
PN
5455static void zremrangebyrankCommand(redisClient *c) {
5456 int start = atoi(c->argv[2]->ptr);
5457 int end = atoi(c->argv[3]->ptr);
5458 robj *zsetobj;
5459 zset *zs;
5460
5461 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5462 if (zsetobj == NULL) {
5463 addReply(c,shared.czero);
5464 } else {
5465 if (zsetobj->type != REDIS_ZSET) {
5466 addReply(c,shared.wrongtypeerr);
5467 return;
5468 }
5469
5470 zs = zsetobj->ptr;
5471 int llen = zs->zsl->length;
5472 long deleted;
5473
5474 /* convert negative indexes */
5475 if (start < 0) start = llen+start;
5476 if (end < 0) end = llen+end;
5477 if (start < 0) start = 0;
5478 if (end < 0) end = 0;
5479
5480 /* indexes sanity checks */
5481 if (start > end || start >= llen) {
5482 addReply(c,shared.czero);
5483 return;
5484 }
5485 if (end >= llen) end = llen-1;
5486
2424490f
PN
5487 /* increment start and end because zsl*Rank functions
5488 * use 1-based rank */
5489 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
9212eafd
PN
5490 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5491 server.dirty += deleted;
5492 addReplyLong(c, deleted);
5493 }
5494}
5495
8f92e768
PN
5496typedef struct {
5497 dict *dict;
5498 double weight;
5499} zsetopsrc;
5500
5501static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5502 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5503 unsigned long size1, size2;
5504 size1 = d1->dict ? dictSize(d1->dict) : 0;
5505 size2 = d2->dict ? dictSize(d2->dict) : 0;
5506 return size1 - size2;
5507}
5508
2830ca53 5509static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768
PN
5510 int i, j, zsetnum;
5511 zsetopsrc *src;
2830ca53
PN
5512 robj *dstobj;
5513 zset *dstzset;
b287c9bb
PN
5514 dictIterator *di;
5515 dictEntry *de;
5516
2830ca53
PN
5517 /* expect zsetnum input keys to be given */
5518 zsetnum = atoi(c->argv[2]->ptr);
5519 if (zsetnum < 1) {
5520 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5521 return;
b287c9bb 5522 }
2830ca53
PN
5523
5524 /* test if the expected number of keys would overflow */
5525 if (3+zsetnum > c->argc) {
b287c9bb
PN
5526 addReply(c,shared.syntaxerr);
5527 return;
5528 }
5529
2830ca53 5530 /* read keys to be used for input */
b9eed483 5531 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5532 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5533 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5534 if (!zsetobj) {
8f92e768 5535 src[i].dict = NULL;
b287c9bb
PN
5536 } else {
5537 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5538 zfree(src);
b287c9bb
PN
5539 addReply(c,shared.wrongtypeerr);
5540 return;
5541 }
8f92e768 5542 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5543 }
2830ca53
PN
5544
5545 /* default all weights to 1 */
8f92e768 5546 src[i].weight = 1.0;
b287c9bb
PN
5547 }
5548
2830ca53
PN
5549 /* parse optional extra arguments */
5550 if (j < c->argc) {
5551 int remaining = c->argc-j;
b287c9bb 5552
2830ca53
PN
5553 while (remaining) {
5554 if (!strcasecmp(c->argv[j]->ptr,"weights")) {
5555 j++; remaining--;
5556 if (remaining < zsetnum) {
8f92e768 5557 zfree(src);
2830ca53
PN
5558 addReplySds(c,sdsnew("-ERR not enough weights for ZUNION/ZINTER\r\n"));
5559 return;
5560 }
5561 for (i = 0; i < zsetnum; i++, j++, remaining--) {
8f92e768 5562 src[i].weight = strtod(c->argv[j]->ptr, NULL);
2830ca53
PN
5563 }
5564 } else {
8f92e768 5565 zfree(src);
2830ca53
PN
5566 addReply(c,shared.syntaxerr);
5567 return;
5568 }
5569 }
5570 }
b287c9bb 5571
2830ca53
PN
5572 dstobj = createZsetObject();
5573 dstzset = dstobj->ptr;
5574
5575 if (op == REDIS_OP_INTER) {
8f92e768
PN
5576 /* sort sets from the smallest to largest, this will improve our
5577 * algorithm's performance */
5578 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5579
5580 /* skip going over all entries if the smallest zset is NULL or empty */
5581 if (src[0].dict && dictSize(src[0].dict) > 0) {
5582 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5583 * from small to large, all src[i > 0].dict are non-empty too */
5584 di = dictGetIterator(src[0].dict);
2830ca53
PN
5585 while((de = dictNext(di)) != NULL) {
5586 double *score = zmalloc(sizeof(double));
5587 *score = 0.0;
5588
8f92e768
PN
5589 for (j = 0; j < zsetnum; j++) {
5590 dictEntry *other = (j == 0) ? de : dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5591 if (other) {
8f92e768 5592 *score = *score + src[j].weight * (*(double*)dictGetEntryVal(other));
2830ca53
PN
5593 } else {
5594 break;
5595 }
5596 }
b287c9bb 5597
2830ca53 5598 /* skip entry when not present in every source dict */
8f92e768 5599 if (j != zsetnum) {
2830ca53
PN
5600 zfree(score);
5601 } else {
5602 robj *o = dictGetEntryKey(de);
5603 dictAdd(dstzset->dict,o,score);
5604 incrRefCount(o); /* added to dictionary */
5605 zslInsert(dstzset->zsl,*score,o);
5606 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5607 }
5608 }
2830ca53
PN
5609 dictReleaseIterator(di);
5610 }
5611 } else if (op == REDIS_OP_UNION) {
5612 for (i = 0; i < zsetnum; i++) {
8f92e768 5613 if (!src[i].dict) continue;
2830ca53 5614
8f92e768 5615 di = dictGetIterator(src[i].dict);
2830ca53
PN
5616 while((de = dictNext(di)) != NULL) {
5617 /* skip key when already processed */
5618 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5619
5620 double *score = zmalloc(sizeof(double));
5621 *score = 0.0;
5622 for (j = 0; j < zsetnum; j++) {
8f92e768 5623 if (!src[j].dict) continue;
2830ca53 5624
8f92e768 5625 dictEntry *other = (i == j) ? de : dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5626 if (other) {
8f92e768 5627 *score = *score + src[j].weight * (*(double*)dictGetEntryVal(other));
2830ca53
PN
5628 }
5629 }
b287c9bb 5630
2830ca53
PN
5631 robj *o = dictGetEntryKey(de);
5632 dictAdd(dstzset->dict,o,score);
5633 incrRefCount(o); /* added to dictionary */
5634 zslInsert(dstzset->zsl,*score,o);
5635 incrRefCount(o); /* added to skiplist */
5636 }
5637 dictReleaseIterator(di);
b287c9bb 5638 }
2830ca53
PN
5639 } else {
5640 /* unknown operator */
5641 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5642 }
5643
5644 deleteKey(c->db,dstkey);
5645 dictAdd(c->db->dict,dstkey,dstobj);
5646 incrRefCount(dstkey);
5647
2830ca53 5648 addReplyLong(c, dstzset->zsl->length);
b287c9bb 5649 server.dirty++;
8f92e768 5650 zfree(src);
b287c9bb
PN
5651}
5652
2830ca53
PN
5653static void zunionCommand(redisClient *c) {
5654 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5655}
5656
2830ca53
PN
5657static void zinterCommand(redisClient *c) {
5658 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5659}
5660
e3870fab 5661static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5662 robj *o;
5663 int start = atoi(c->argv[2]->ptr);
5664 int end = atoi(c->argv[3]->ptr);
752da584 5665 int withscores = 0;
5666
5667 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5668 withscores = 1;
5669 } else if (c->argc >= 5) {
5670 addReply(c,shared.syntaxerr);
5671 return;
5672 }
cc812361 5673
5674 o = lookupKeyRead(c->db,c->argv[1]);
5675 if (o == NULL) {
5676 addReply(c,shared.nullmultibulk);
5677 } else {
5678 if (o->type != REDIS_ZSET) {
5679 addReply(c,shared.wrongtypeerr);
5680 } else {
5681 zset *zsetobj = o->ptr;
5682 zskiplist *zsl = zsetobj->zsl;
5683 zskiplistNode *ln;
5684
5685 int llen = zsl->length;
5686 int rangelen, j;
5687 robj *ele;
5688
5689 /* convert negative indexes */
5690 if (start < 0) start = llen+start;
5691 if (end < 0) end = llen+end;
5692 if (start < 0) start = 0;
5693 if (end < 0) end = 0;
5694
5695 /* indexes sanity checks */
5696 if (start > end || start >= llen) {
5697 /* Out of range start or start > end result in empty list */
5698 addReply(c,shared.emptymultibulk);
5699 return;
5700 }
5701 if (end >= llen) end = llen-1;
5702 rangelen = (end-start)+1;
5703
edb51958
PN
5704 /* check if starting point is trivial, before searching
5705 * the element in log(N) time */
e3870fab 5706 if (reverse) {
2424490f 5707 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
e3870fab 5708 } else {
2424490f 5709 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
e3870fab 5710 }
cc812361 5711
edb51958 5712 /* Return the result in form of a multi-bulk reply */
752da584 5713 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5714 withscores ? (rangelen*2) : rangelen));
cc812361 5715 for (j = 0; j < rangelen; j++) {
0aad7a19 5716 ele = ln->obj;
cc812361 5717 addReplyBulkLen(c,ele);
5718 addReply(c,ele);
5719 addReply(c,shared.crlf);
752da584 5720 if (withscores)
5721 addReplyDouble(c,ln->score);
e3870fab 5722 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5723 }
5724 }
5725 }
5726}
5727
e3870fab 5728static void zrangeCommand(redisClient *c) {
5729 zrangeGenericCommand(c,0);
5730}
5731
5732static void zrevrangeCommand(redisClient *c) {
5733 zrangeGenericCommand(c,1);
5734}
5735
f44dd428 5736/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5737 * If justcount is non-zero, just the count is returned. */
5738static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5739 robj *o;
f44dd428 5740 double min, max;
5741 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5742 int offset = 0, limit = -1;
0500ef27
SH
5743 int withscores = 0;
5744 int badsyntax = 0;
5745
f44dd428 5746 /* Parse the min-max interval. If one of the values is prefixed
5747 * by the "(" character, it's considered "open". For instance
5748 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5749 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5750 if (((char*)c->argv[2]->ptr)[0] == '(') {
5751 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5752 minex = 1;
5753 } else {
5754 min = strtod(c->argv[2]->ptr,NULL);
5755 }
5756 if (((char*)c->argv[3]->ptr)[0] == '(') {
5757 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5758 maxex = 1;
5759 } else {
5760 max = strtod(c->argv[3]->ptr,NULL);
5761 }
5762
5763 /* Parse "WITHSCORES": note that if the command was called with
5764 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5765 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5766 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5767 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5768 withscores = 1;
5769 else
5770 badsyntax = 1;
0500ef27 5771 }
3a3978b1 5772 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5773 badsyntax = 1;
0500ef27 5774 if (badsyntax) {
454d4e43 5775 addReplySds(c,
5776 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5777 return;
0500ef27
SH
5778 }
5779
f44dd428 5780 /* Parse "LIMIT" */
0500ef27 5781 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5782 addReply(c,shared.syntaxerr);
5783 return;
0500ef27 5784 } else if (c->argc == (7 + withscores)) {
80181f78 5785 offset = atoi(c->argv[5]->ptr);
5786 limit = atoi(c->argv[6]->ptr);
0b13687c 5787 if (offset < 0) offset = 0;
80181f78 5788 }
50c55df5 5789
f44dd428 5790 /* Ok, lookup the key and get the range */
50c55df5 5791 o = lookupKeyRead(c->db,c->argv[1]);
5792 if (o == NULL) {
f44dd428 5793 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
50c55df5 5794 } else {
5795 if (o->type != REDIS_ZSET) {
5796 addReply(c,shared.wrongtypeerr);
5797 } else {
5798 zset *zsetobj = o->ptr;
5799 zskiplist *zsl = zsetobj->zsl;
5800 zskiplistNode *ln;
f44dd428 5801 robj *ele, *lenobj = NULL;
5802 unsigned long rangelen = 0;
50c55df5 5803
f44dd428 5804 /* Get the first node with the score >= min, or with
5805 * score > min if 'minex' is true. */
50c55df5 5806 ln = zslFirstWithScore(zsl,min);
f44dd428 5807 while (minex && ln && ln->score == min) ln = ln->forward[0];
5808
50c55df5 5809 if (ln == NULL) {
5810 /* No element matching the speciifed interval */
f44dd428 5811 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5812 return;
5813 }
5814
5815 /* We don't know in advance how many matching elements there
5816 * are in the list, so we push this object that will represent
5817 * the multi-bulk length in the output buffer, and will "fix"
5818 * it later */
f44dd428 5819 if (!justcount) {
5820 lenobj = createObject(REDIS_STRING,NULL);
5821 addReply(c,lenobj);
5822 decrRefCount(lenobj);
5823 }
50c55df5 5824
f44dd428 5825 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5826 if (offset) {
5827 offset--;
5828 ln = ln->forward[0];
5829 continue;
5830 }
5831 if (limit == 0) break;
f44dd428 5832 if (!justcount) {
5833 ele = ln->obj;
5834 addReplyBulkLen(c,ele);
5835 addReply(c,ele);
5836 addReply(c,shared.crlf);
5837 if (withscores)
5838 addReplyDouble(c,ln->score);
5839 }
50c55df5 5840 ln = ln->forward[0];
5841 rangelen++;
80181f78 5842 if (limit > 0) limit--;
50c55df5 5843 }
f44dd428 5844 if (justcount) {
5845 addReplyLong(c,(long)rangelen);
5846 } else {
5847 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5848 withscores ? (rangelen*2) : rangelen);
5849 }
50c55df5 5850 }
5851 }
5852}
5853
f44dd428 5854static void zrangebyscoreCommand(redisClient *c) {
5855 genericZrangebyscoreCommand(c,0);
5856}
5857
5858static void zcountCommand(redisClient *c) {
5859 genericZrangebyscoreCommand(c,1);
5860}
5861
3c41331e 5862static void zcardCommand(redisClient *c) {
e197b441 5863 robj *o;
5864 zset *zs;
5865
5866 o = lookupKeyRead(c->db,c->argv[1]);
5867 if (o == NULL) {
5868 addReply(c,shared.czero);
5869 return;
5870 } else {
5871 if (o->type != REDIS_ZSET) {
5872 addReply(c,shared.wrongtypeerr);
5873 } else {
5874 zs = o->ptr;
682ac724 5875 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 5876 }
5877 }
5878}
5879
6e333bbe 5880static void zscoreCommand(redisClient *c) {
5881 robj *o;
5882 zset *zs;
5883
5884 o = lookupKeyRead(c->db,c->argv[1]);
5885 if (o == NULL) {
96d8b4ee 5886 addReply(c,shared.nullbulk);
6e333bbe 5887 return;
5888 } else {
5889 if (o->type != REDIS_ZSET) {
5890 addReply(c,shared.wrongtypeerr);
5891 } else {
5892 dictEntry *de;
5893
5894 zs = o->ptr;
5895 de = dictFind(zs->dict,c->argv[2]);
5896 if (!de) {
5897 addReply(c,shared.nullbulk);
5898 } else {
6e333bbe 5899 double *score = dictGetEntryVal(de);
5900
e2665397 5901 addReplyDouble(c,*score);
6e333bbe 5902 }
5903 }
5904 }
5905}
5906
798d9e55 5907static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e
PN
5908 robj *o;
5909 o = lookupKeyRead(c->db,c->argv[1]);
5910 if (o == NULL) {
5911 addReply(c,shared.nullbulk);
5912 return;
5913 }
5914 if (o->type != REDIS_ZSET) {
5915 addReply(c,shared.wrongtypeerr);
27b0ccca
PN
5916 } else {
5917 zset *zs = o->ptr;
5918 zskiplist *zsl = zs->zsl;
5919 dictEntry *de;
5920 unsigned long rank;
69d95c3e 5921
27b0ccca
PN
5922 de = dictFind(zs->dict,c->argv[2]);
5923 if (!de) {
5924 addReply(c,shared.nullbulk);
5925 return;
69d95c3e
PN
5926 }
5927
27b0ccca
PN
5928 double *score = dictGetEntryVal(de);
5929 rank = zslGetRank(zsl, *score, c->argv[2]);
5930 if (rank) {
798d9e55
PN
5931 if (reverse) {
5932 addReplyLong(c, zsl->length - rank);
5933 } else {
5934 addReplyLong(c, rank-1);
5935 }
27b0ccca
PN
5936 } else {
5937 addReply(c,shared.nullbulk);
69d95c3e 5938 }
978c2c94 5939 }
5940}
5941
798d9e55
PN
5942static void zrankCommand(redisClient *c) {
5943 zrankGenericCommand(c, 0);
5944}
5945
5946static void zrevrankCommand(redisClient *c) {
5947 zrankGenericCommand(c, 1);
5948}
5949
cbba7dd7 5950/* =================================== Hashes =============================== */
978c2c94 5951static void hsetCommand(redisClient *c) {
5952 int update = 0;
5953 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5954
5955 if (o == NULL) {
5956 o = createHashObject();
5957 dictAdd(c->db->dict,c->argv[1],o);
5958 incrRefCount(c->argv[1]);
5959 } else {
5960 if (o->type != REDIS_HASH) {
5961 addReply(c,shared.wrongtypeerr);
5962 return;
5963 }
5964 }
bae2c7ec 5965 /* We want to convert the zipmap into an hash table right now if the
5966 * entry to be added is too big. Note that we check if the object
5967 * is integer encoded before to try fetching the length in the test below.
5968 * This is because integers are small, but currently stringObjectLen()
5969 * performs a slow conversion: not worth it. */
5970 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5971 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5972 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5973 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5974 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5975 {
5976 convertToRealHash(o);
5977 }
5978
978c2c94 5979 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5980 unsigned char *zm = o->ptr;
b1befe6a 5981 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 5982
5983 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 5984 valobj->ptr,sdslen(valobj->ptr),&update);
5985 decrRefCount(valobj);
cbba7dd7 5986 o->ptr = zm;
bae2c7ec 5987
5988 /* And here there is the second check for hash conversion...
5989 * we want to do it only if the operation was not just an update as
5990 * zipmapLen() is O(N). */
5991 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5992 convertToRealHash(o);
978c2c94 5993 } else {
bae2c7ec 5994 tryObjectEncoding(c->argv[2]);
5995 /* note that c->argv[3] is already encoded, as the latest arg
5996 * of a bulk command is always integer encoded if possible. */
978c2c94 5997 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5998 incrRefCount(c->argv[2]);
5999 } else {
6000 update = 1;
6001 }
6002 incrRefCount(c->argv[3]);
6003 }
6004 server.dirty++;
6005 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6006}
6007
6008static void hgetCommand(redisClient *c) {
6009 robj *o = lookupKeyRead(c->db,c->argv[1]);
6010
6011 if (o == NULL) {
6012 addReply(c,shared.nullbulk);
6013 return;
6014 } else {
bcd11906 6015 if (o->type != REDIS_HASH) {
6016 addReply(c,shared.wrongtypeerr);
6017 return;
6018 }
6019
978c2c94 6020 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6021 unsigned char *zm = o->ptr;
6022 unsigned char *val;
6023 unsigned int vlen;
6024
6025 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
6026 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6027 addReplySds(c,sdsnewlen(val,vlen));
6028 addReply(c,shared.crlf);
6029 return;
6030 } else {
6031 addReply(c,shared.nullbulk);
6032 return;
6033 }
6034 } else {
6035 struct dictEntry *de;
6036
6037 de = dictFind(o->ptr,c->argv[2]);
6038 if (de == NULL) {
6039 addReply(c,shared.nullbulk);
6040 } else {
6041 robj *e = dictGetEntryVal(de);
6042
6043 addReplyBulkLen(c,e);
6044 addReply(c,e);
6045 addReply(c,shared.crlf);
6046 }
6047 }
69d95c3e 6048 }
69d95c3e
PN
6049}
6050
07efaf74 6051static void hdelCommand(redisClient *c) {
92b27fe9 6052 robj *o = lookupKeyWrite(c->db,c->argv[1]);
07efaf74 6053
6054 if (o == NULL) {
6055 addReply(c,shared.czero);
6056 return;
6057 } else {
6058 int deleted = 0;
6059
6060 if (o->type != REDIS_HASH) {
6061 addReply(c,shared.wrongtypeerr);
6062 return;
6063 }
6064
6065 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6066 o->ptr = zipmapDel((unsigned char*) o->ptr,
6067 (unsigned char*) c->argv[2]->ptr,
6068 sdslen(c->argv[2]->ptr), &deleted);
6069 } else {
6070 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6071 }
6072 addReply(c,deleted ? shared.cone : shared.czero);
6073 }
6074}
6075
92b27fe9 6076static void hlenCommand(redisClient *c) {
6077 robj *o;
6078 unsigned long len;
6079
6080 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6081 checkType(c,o,REDIS_HASH)) return;
6082
6083 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6084 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6085 addReplyUlong(c,len);
6086}
6087
ada386b2 6088static void convertToRealHash(robj *o) {
6089 unsigned char *key, *val, *p, *zm = o->ptr;
6090 unsigned int klen, vlen;
6091 dict *dict = dictCreate(&hashDictType,NULL);
6092
6093 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6094 p = zipmapRewind(zm);
6095 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6096 robj *keyobj, *valobj;
6097
6098 keyobj = createStringObject((char*)key,klen);
6099 valobj = createStringObject((char*)val,vlen);
6100 tryObjectEncoding(keyobj);
6101 tryObjectEncoding(valobj);
6102 dictAdd(dict,keyobj,valobj);
6103 }
6104 o->encoding = REDIS_ENCODING_HT;
6105 o->ptr = dict;
6106 zfree(zm);
6107}
6108
6b47e12e 6109/* ========================= Non type-specific commands ==================== */
6110
ed9b544e 6111static void flushdbCommand(redisClient *c) {
ca37e9cd 6112 server.dirty += dictSize(c->db->dict);
3305306f 6113 dictEmpty(c->db->dict);
6114 dictEmpty(c->db->expires);
ed9b544e 6115 addReply(c,shared.ok);
ed9b544e 6116}
6117
6118static void flushallCommand(redisClient *c) {
ca37e9cd 6119 server.dirty += emptyDb();
ed9b544e 6120 addReply(c,shared.ok);
f78fd11b 6121 rdbSave(server.dbfilename);
ca37e9cd 6122 server.dirty++;
ed9b544e 6123}
6124
56906eef 6125static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6126 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6127 so->type = type;
6128 so->pattern = pattern;
6129 return so;
6130}
6131
6132/* Return the value associated to the key with a name obtained
6133 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6134static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6135 char *p;
6136 sds spat, ssub;
6137 robj keyobj;
6138 int prefixlen, sublen, postfixlen;
ed9b544e 6139 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6140 struct {
f1017b3f 6141 long len;
6142 long free;
ed9b544e 6143 char buf[REDIS_SORTKEY_MAX+1];
6144 } keyname;
6145
28173a49 6146 /* If the pattern is "#" return the substitution object itself in order
6147 * to implement the "SORT ... GET #" feature. */
6148 spat = pattern->ptr;
6149 if (spat[0] == '#' && spat[1] == '\0') {
6150 return subst;
6151 }
6152
6153 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6154 * a decoded object on the fly. Otherwise getDecodedObject will just
6155 * increment the ref count, that we'll decrement later. */
6156 subst = getDecodedObject(subst);
942a3961 6157
ed9b544e 6158 ssub = subst->ptr;
6159 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6160 p = strchr(spat,'*');
ed5a857a 6161 if (!p) {
6162 decrRefCount(subst);
6163 return NULL;
6164 }
ed9b544e 6165
6166 prefixlen = p-spat;
6167 sublen = sdslen(ssub);
6168 postfixlen = sdslen(spat)-(prefixlen+1);
6169 memcpy(keyname.buf,spat,prefixlen);
6170 memcpy(keyname.buf+prefixlen,ssub,sublen);
6171 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6172 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6173 keyname.len = prefixlen+sublen+postfixlen;
6174
dfc5e96c 6175 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6176 decrRefCount(subst);
6177
a4d1ba9a 6178 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6179 return lookupKeyRead(db,&keyobj);
ed9b544e 6180}
6181
6182/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6183 * the additional parameter is not standard but a BSD-specific we have to
6184 * pass sorting parameters via the global 'server' structure */
6185static int sortCompare(const void *s1, const void *s2) {
6186 const redisSortObject *so1 = s1, *so2 = s2;
6187 int cmp;
6188
6189 if (!server.sort_alpha) {
6190 /* Numeric sorting. Here it's trivial as we precomputed scores */
6191 if (so1->u.score > so2->u.score) {
6192 cmp = 1;
6193 } else if (so1->u.score < so2->u.score) {
6194 cmp = -1;
6195 } else {
6196 cmp = 0;
6197 }
6198 } else {
6199 /* Alphanumeric sorting */
6200 if (server.sort_bypattern) {
6201 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6202 /* At least one compare object is NULL */
6203 if (so1->u.cmpobj == so2->u.cmpobj)
6204 cmp = 0;
6205 else if (so1->u.cmpobj == NULL)
6206 cmp = -1;
6207 else
6208 cmp = 1;
6209 } else {
6210 /* We have both the objects, use strcoll */
6211 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6212 }
6213 } else {
6214 /* Compare elements directly */
9d65a1bb 6215 robj *dec1, *dec2;
6216
6217 dec1 = getDecodedObject(so1->obj);
6218 dec2 = getDecodedObject(so2->obj);
6219 cmp = strcoll(dec1->ptr,dec2->ptr);
6220 decrRefCount(dec1);
6221 decrRefCount(dec2);
ed9b544e 6222 }
6223 }
6224 return server.sort_desc ? -cmp : cmp;
6225}
6226
6227/* The SORT command is the most complex command in Redis. Warning: this code
6228 * is optimized for speed and a bit less for readability */
6229static void sortCommand(redisClient *c) {
ed9b544e 6230 list *operations;
6231 int outputlen = 0;
6232 int desc = 0, alpha = 0;
6233 int limit_start = 0, limit_count = -1, start, end;
6234 int j, dontsort = 0, vectorlen;
6235 int getop = 0; /* GET operation counter */
443c6409 6236 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6237 redisSortObject *vector; /* Resulting vector to sort */
6238
6239 /* Lookup the key to sort. It must be of the right types */
3305306f 6240 sortval = lookupKeyRead(c->db,c->argv[1]);
6241 if (sortval == NULL) {
d922ae65 6242 addReply(c,shared.nullmultibulk);
ed9b544e 6243 return;
6244 }
a5eb649b 6245 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6246 sortval->type != REDIS_ZSET)
6247 {
c937aa89 6248 addReply(c,shared.wrongtypeerr);
ed9b544e 6249 return;
6250 }
6251
6252 /* Create a list of operations to perform for every sorted element.
6253 * Operations can be GET/DEL/INCR/DECR */
6254 operations = listCreate();
092dac2a 6255 listSetFreeMethod(operations,zfree);
ed9b544e 6256 j = 2;
6257
6258 /* Now we need to protect sortval incrementing its count, in the future
6259 * SORT may have options able to overwrite/delete keys during the sorting
6260 * and the sorted key itself may get destroied */
6261 incrRefCount(sortval);
6262
6263 /* The SORT command has an SQL-alike syntax, parse it */
6264 while(j < c->argc) {
6265 int leftargs = c->argc-j-1;
6266 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6267 desc = 0;
6268 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6269 desc = 1;
6270 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6271 alpha = 1;
6272 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6273 limit_start = atoi(c->argv[j+1]->ptr);
6274 limit_count = atoi(c->argv[j+2]->ptr);
6275 j+=2;
443c6409 6276 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6277 storekey = c->argv[j+1];
6278 j++;
ed9b544e 6279 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6280 sortby = c->argv[j+1];
6281 /* If the BY pattern does not contain '*', i.e. it is constant,
6282 * we don't need to sort nor to lookup the weight keys. */
6283 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6284 j++;
6285 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6286 listAddNodeTail(operations,createSortOperation(
6287 REDIS_SORT_GET,c->argv[j+1]));
6288 getop++;
6289 j++;
ed9b544e 6290 } else {
6291 decrRefCount(sortval);
6292 listRelease(operations);
c937aa89 6293 addReply(c,shared.syntaxerr);
ed9b544e 6294 return;
6295 }
6296 j++;
6297 }
6298
6299 /* Load the sorting vector with all the objects to sort */
a5eb649b 6300 switch(sortval->type) {
6301 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6302 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6303 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6304 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6305 }
ed9b544e 6306 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6307 j = 0;
a5eb649b 6308
ed9b544e 6309 if (sortval->type == REDIS_LIST) {
6310 list *list = sortval->ptr;
6208b3a7 6311 listNode *ln;
c7df85a4 6312 listIter li;
6208b3a7 6313
c7df85a4 6314 listRewind(list,&li);
6315 while((ln = listNext(&li))) {
ed9b544e 6316 robj *ele = ln->value;
6317 vector[j].obj = ele;
6318 vector[j].u.score = 0;
6319 vector[j].u.cmpobj = NULL;
ed9b544e 6320 j++;
6321 }
6322 } else {
a5eb649b 6323 dict *set;
ed9b544e 6324 dictIterator *di;
6325 dictEntry *setele;
6326
a5eb649b 6327 if (sortval->type == REDIS_SET) {
6328 set = sortval->ptr;
6329 } else {
6330 zset *zs = sortval->ptr;
6331 set = zs->dict;
6332 }
6333
ed9b544e 6334 di = dictGetIterator(set);
ed9b544e 6335 while((setele = dictNext(di)) != NULL) {
6336 vector[j].obj = dictGetEntryKey(setele);
6337 vector[j].u.score = 0;
6338 vector[j].u.cmpobj = NULL;
6339 j++;
6340 }
6341 dictReleaseIterator(di);
6342 }
dfc5e96c 6343 redisAssert(j == vectorlen);
ed9b544e 6344
6345 /* Now it's time to load the right scores in the sorting vector */
6346 if (dontsort == 0) {
6347 for (j = 0; j < vectorlen; j++) {
6348 if (sortby) {
6349 robj *byval;
6350
3305306f 6351 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 6352 if (!byval || byval->type != REDIS_STRING) continue;
6353 if (alpha) {
9d65a1bb 6354 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6355 } else {
942a3961 6356 if (byval->encoding == REDIS_ENCODING_RAW) {
6357 vector[j].u.score = strtod(byval->ptr,NULL);
6358 } else {
9d65a1bb 6359 /* Don't need to decode the object if it's
6360 * integer-encoded (the only encoding supported) so
6361 * far. We can just cast it */
f1017b3f 6362 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6363 vector[j].u.score = (long)byval->ptr;
f1017b3f 6364 } else
dfc5e96c 6365 redisAssert(1 != 1);
942a3961 6366 }
ed9b544e 6367 }
6368 } else {
942a3961 6369 if (!alpha) {
6370 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6371 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6372 else {
6373 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6374 vector[j].u.score = (long) vector[j].obj->ptr;
6375 else
dfc5e96c 6376 redisAssert(1 != 1);
942a3961 6377 }
6378 }
ed9b544e 6379 }
6380 }
6381 }
6382
6383 /* We are ready to sort the vector... perform a bit of sanity check
6384 * on the LIMIT option too. We'll use a partial version of quicksort. */
6385 start = (limit_start < 0) ? 0 : limit_start;
6386 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6387 if (start >= vectorlen) {
6388 start = vectorlen-1;
6389 end = vectorlen-2;
6390 }
6391 if (end >= vectorlen) end = vectorlen-1;
6392
6393 if (dontsort == 0) {
6394 server.sort_desc = desc;
6395 server.sort_alpha = alpha;
6396 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6397 if (sortby && (start != 0 || end != vectorlen-1))
6398 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6399 else
6400 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6401 }
6402
6403 /* Send command output to the output buffer, performing the specified
6404 * GET/DEL/INCR/DECR operations if any. */
6405 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6406 if (storekey == NULL) {
6407 /* STORE option not specified, sent the sorting result to client */
6408 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6409 for (j = start; j <= end; j++) {
6410 listNode *ln;
c7df85a4 6411 listIter li;
6412
443c6409 6413 if (!getop) {
6414 addReplyBulkLen(c,vector[j].obj);
6415 addReply(c,vector[j].obj);
6416 addReply(c,shared.crlf);
6417 }
c7df85a4 6418 listRewind(operations,&li);
6419 while((ln = listNext(&li))) {
443c6409 6420 redisSortOperation *sop = ln->value;
6421 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6422 vector[j].obj);
6423
6424 if (sop->type == REDIS_SORT_GET) {
6425 if (!val || val->type != REDIS_STRING) {
6426 addReply(c,shared.nullbulk);
6427 } else {
6428 addReplyBulkLen(c,val);
6429 addReply(c,val);
6430 addReply(c,shared.crlf);
6431 }
6432 } else {
dfc5e96c 6433 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6434 }
6435 }
ed9b544e 6436 }
443c6409 6437 } else {
6438 robj *listObject = createListObject();
6439 list *listPtr = (list*) listObject->ptr;
6440
6441 /* STORE option specified, set the sorting result as a List object */
6442 for (j = start; j <= end; j++) {
6443 listNode *ln;
c7df85a4 6444 listIter li;
6445
443c6409 6446 if (!getop) {
6447 listAddNodeTail(listPtr,vector[j].obj);
6448 incrRefCount(vector[j].obj);
6449 }
c7df85a4 6450 listRewind(operations,&li);
6451 while((ln = listNext(&li))) {
443c6409 6452 redisSortOperation *sop = ln->value;
6453 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6454 vector[j].obj);
6455
6456 if (sop->type == REDIS_SORT_GET) {
6457 if (!val || val->type != REDIS_STRING) {
6458 listAddNodeTail(listPtr,createStringObject("",0));
6459 } else {
6460 listAddNodeTail(listPtr,val);
6461 incrRefCount(val);
6462 }
ed9b544e 6463 } else {
dfc5e96c 6464 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6465 }
ed9b544e 6466 }
ed9b544e 6467 }
121796f7 6468 if (dictReplace(c->db->dict,storekey,listObject)) {
6469 incrRefCount(storekey);
6470 }
443c6409 6471 /* Note: we add 1 because the DB is dirty anyway since even if the
6472 * SORT result is empty a new key is set and maybe the old content
6473 * replaced. */
6474 server.dirty += 1+outputlen;
6475 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6476 }
6477
6478 /* Cleanup */
6479 decrRefCount(sortval);
6480 listRelease(operations);
6481 for (j = 0; j < vectorlen; j++) {
6482 if (sortby && alpha && vector[j].u.cmpobj)
6483 decrRefCount(vector[j].u.cmpobj);
6484 }
6485 zfree(vector);
6486}
6487
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer: it must be large enough for the formatted
 * value plus the terminating NUL (the longest output is a 20 digits byte
 * count followed by 'B'). 'n' is the amount of bytes to format.
 *
 * Values below 1024 are printed as an integer number of bytes, larger
 * values are scaled to K, M, G, T or P with two decimal digits. Every
 * possible value of 'n' writes a string into 's': amounts of 1024P or more
 * fall back to the plain byte count, so the caller never reads an
 * uninitialized buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units we know about: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6508
1c85b79f 6509/* Create the string returned by the INFO command. This is decoupled
6510 * by the INFO command itself as we need to report the same information
6511 * on memory corruption problems. */
6512static sds genRedisInfoString(void) {
ed9b544e 6513 sds info;
6514 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6515 int j;
ec6c7a1d 6516 char hmem[64];
55a8298f 6517
6518 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
6519 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
ec6c7a1d 6520
b72f6a4b 6521 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6522 info = sdscatprintf(sdsempty(),
6523 "redis_version:%s\r\n"
f1017b3f 6524 "arch_bits:%s\r\n"
7a932b74 6525 "multiplexing_api:%s\r\n"
0d7170a4 6526 "process_id:%ld\r\n"
682ac724 6527 "uptime_in_seconds:%ld\r\n"
6528 "uptime_in_days:%ld\r\n"
ed9b544e 6529 "connected_clients:%d\r\n"
6530 "connected_slaves:%d\r\n"
f86a74e9 6531 "blocked_clients:%d\r\n"
5fba9f71 6532 "used_memory:%zu\r\n"
ec6c7a1d 6533 "used_memory_human:%s\r\n"
ed9b544e 6534 "changes_since_last_save:%lld\r\n"
be2bb6b0 6535 "bgsave_in_progress:%d\r\n"
682ac724 6536 "last_save_time:%ld\r\n"
b3fad521 6537 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6538 "total_connections_received:%lld\r\n"
6539 "total_commands_processed:%lld\r\n"
55a8298f 6540 "hash_max_zipmap_entries:%ld\r\n"
6541 "hash_max_zipmap_value:%ld\r\n"
7d98e08c 6542 "vm_enabled:%d\r\n"
a0f643ea 6543 "role:%s\r\n"
ed9b544e 6544 ,REDIS_VERSION,
f1017b3f 6545 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6546 aeGetApiName(),
0d7170a4 6547 (long) getpid(),
a0f643ea 6548 uptime,
6549 uptime/(3600*24),
ed9b544e 6550 listLength(server.clients)-listLength(server.slaves),
6551 listLength(server.slaves),
d5d55fc3 6552 server.blpop_blocked_clients,
b72f6a4b 6553 zmalloc_used_memory(),
ec6c7a1d 6554 hmem,
ed9b544e 6555 server.dirty,
9d65a1bb 6556 server.bgsavechildpid != -1,
ed9b544e 6557 server.lastsave,
b3fad521 6558 server.bgrewritechildpid != -1,
ed9b544e 6559 server.stat_numconnections,
6560 server.stat_numcommands,
55a8298f 6561 server.hash_max_zipmap_entries,
6562 server.hash_max_zipmap_value,
7d98e08c 6563 server.vm_enabled != 0,
a0f643ea 6564 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6565 );
a0f643ea 6566 if (server.masterhost) {
6567 info = sdscatprintf(info,
6568 "master_host:%s\r\n"
6569 "master_port:%d\r\n"
6570 "master_link_status:%s\r\n"
6571 "master_last_io_seconds_ago:%d\r\n"
6572 ,server.masterhost,
6573 server.masterport,
6574 (server.replstate == REDIS_REPL_CONNECTED) ?
6575 "up" : "down",
f72b934d 6576 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6577 );
6578 }
7d98e08c 6579 if (server.vm_enabled) {
1064ef87 6580 lockThreadedIO();
7d98e08c 6581 info = sdscatprintf(info,
6582 "vm_conf_max_memory:%llu\r\n"
6583 "vm_conf_page_size:%llu\r\n"
6584 "vm_conf_pages:%llu\r\n"
6585 "vm_stats_used_pages:%llu\r\n"
6586 "vm_stats_swapped_objects:%llu\r\n"
6587 "vm_stats_swappin_count:%llu\r\n"
6588 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6589 "vm_stats_io_newjobs_len:%lu\r\n"
6590 "vm_stats_io_processing_len:%lu\r\n"
6591 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6592 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6593 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6594 ,(unsigned long long) server.vm_max_memory,
6595 (unsigned long long) server.vm_page_size,
6596 (unsigned long long) server.vm_pages,
6597 (unsigned long long) server.vm_stats_used_pages,
6598 (unsigned long long) server.vm_stats_swapped_objects,
6599 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6600 (unsigned long long) server.vm_stats_swapouts,
6601 (unsigned long) listLength(server.io_newjobs),
6602 (unsigned long) listLength(server.io_processing),
6603 (unsigned long) listLength(server.io_processed),
d5d55fc3 6604 (unsigned long) server.io_active_threads,
6605 (unsigned long) server.vm_blocked_clients
7d98e08c 6606 );
1064ef87 6607 unlockThreadedIO();
7d98e08c 6608 }
c3cb078d 6609 for (j = 0; j < server.dbnum; j++) {
6610 long long keys, vkeys;
6611
6612 keys = dictSize(server.db[j].dict);
6613 vkeys = dictSize(server.db[j].expires);
6614 if (keys || vkeys) {
9d65a1bb 6615 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6616 j, keys, vkeys);
6617 }
6618 }
1c85b79f 6619 return info;
6620}
6621
6622static void infoCommand(redisClient *c) {
6623 sds info = genRedisInfoString();
83c6a618 6624 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6625 (unsigned long)sdslen(info)));
ed9b544e 6626 addReplySds(c,info);
70003d28 6627 addReply(c,shared.crlf);
ed9b544e 6628}
6629
3305306f 6630static void monitorCommand(redisClient *c) {
6631 /* ignore MONITOR if aleady slave or in monitor mode */
6632 if (c->flags & REDIS_SLAVE) return;
6633
6634 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6635 c->slaveseldb = 0;
6b47e12e 6636 listAddNodeTail(server.monitors,c);
3305306f 6637 addReply(c,shared.ok);
6638}
6639
6640/* ================================= Expire ================================= */
6641static int removeExpire(redisDb *db, robj *key) {
6642 if (dictDelete(db->expires,key) == DICT_OK) {
6643 return 1;
6644 } else {
6645 return 0;
6646 }
6647}
6648
6649static int setExpire(redisDb *db, robj *key, time_t when) {
6650 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6651 return 0;
6652 } else {
6653 incrRefCount(key);
6654 return 1;
6655 }
6656}
6657
bb32ede5 6658/* Return the expire time of the specified key, or -1 if no expire
6659 * is associated with this key (i.e. the key is non volatile) */
6660static time_t getExpire(redisDb *db, robj *key) {
6661 dictEntry *de;
6662
6663 /* No expire? return ASAP */
6664 if (dictSize(db->expires) == 0 ||
6665 (de = dictFind(db->expires,key)) == NULL) return -1;
6666
6667 return (time_t) dictGetEntryVal(de);
6668}
6669
3305306f 6670static int expireIfNeeded(redisDb *db, robj *key) {
6671 time_t when;
6672 dictEntry *de;
6673
6674 /* No expire? return ASAP */
6675 if (dictSize(db->expires) == 0 ||
6676 (de = dictFind(db->expires,key)) == NULL) return 0;
6677
6678 /* Lookup the expire */
6679 when = (time_t) dictGetEntryVal(de);
6680 if (time(NULL) <= when) return 0;
6681
6682 /* Delete the key */
6683 dictDelete(db->expires,key);
6684 return dictDelete(db->dict,key) == DICT_OK;
6685}
6686
6687static int deleteIfVolatile(redisDb *db, robj *key) {
6688 dictEntry *de;
6689
6690 /* No expire? return ASAP */
6691 if (dictSize(db->expires) == 0 ||
6692 (de = dictFind(db->expires,key)) == NULL) return 0;
6693
6694 /* Delete the key */
0c66a471 6695 server.dirty++;
3305306f 6696 dictDelete(db->expires,key);
6697 return dictDelete(db->dict,key) == DICT_OK;
6698}
6699
802e8373 6700static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 6701 dictEntry *de;
3305306f 6702
802e8373 6703 de = dictFind(c->db->dict,key);
3305306f 6704 if (de == NULL) {
6705 addReply(c,shared.czero);
6706 return;
6707 }
43e5ccdf 6708 if (seconds < 0) {
6709 if (deleteKey(c->db,key)) server.dirty++;
6710 addReply(c, shared.cone);
3305306f 6711 return;
6712 } else {
6713 time_t when = time(NULL)+seconds;
802e8373 6714 if (setExpire(c->db,key,when)) {
3305306f 6715 addReply(c,shared.cone);
77423026 6716 server.dirty++;
6717 } else {
3305306f 6718 addReply(c,shared.czero);
77423026 6719 }
3305306f 6720 return;
6721 }
6722}
6723
802e8373 6724static void expireCommand(redisClient *c) {
6725 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6726}
6727
6728static void expireatCommand(redisClient *c) {
6729 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6730}
6731
fd88489a 6732static void ttlCommand(redisClient *c) {
6733 time_t expire;
6734 int ttl = -1;
6735
6736 expire = getExpire(c->db,c->argv[1]);
6737 if (expire != -1) {
6738 ttl = (int) (expire-time(NULL));
6739 if (ttl < 0) ttl = -1;
6740 }
6741 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6742}
6743
6e469882 6744/* ================================ MULTI/EXEC ============================== */
6745
6746/* Client state initialization for MULTI/EXEC */
6747static void initClientMultiState(redisClient *c) {
6748 c->mstate.commands = NULL;
6749 c->mstate.count = 0;
6750}
6751
6752/* Release all the resources associated with MULTI/EXEC state */
6753static void freeClientMultiState(redisClient *c) {
6754 int j;
6755
6756 for (j = 0; j < c->mstate.count; j++) {
6757 int i;
6758 multiCmd *mc = c->mstate.commands+j;
6759
6760 for (i = 0; i < mc->argc; i++)
6761 decrRefCount(mc->argv[i]);
6762 zfree(mc->argv);
6763 }
6764 zfree(c->mstate.commands);
6765}
6766
6767/* Add a new command into the MULTI commands queue */
6768static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6769 multiCmd *mc;
6770 int j;
6771
6772 c->mstate.commands = zrealloc(c->mstate.commands,
6773 sizeof(multiCmd)*(c->mstate.count+1));
6774 mc = c->mstate.commands+c->mstate.count;
6775 mc->cmd = cmd;
6776 mc->argc = c->argc;
6777 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6778 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6779 for (j = 0; j < c->argc; j++)
6780 incrRefCount(mc->argv[j]);
6781 c->mstate.count++;
6782}
6783
6784static void multiCommand(redisClient *c) {
6785 c->flags |= REDIS_MULTI;
36c548f0 6786 addReply(c,shared.ok);
6e469882 6787}
6788
18b6cb76
DJ
6789static void discardCommand(redisClient *c) {
6790 if (!(c->flags & REDIS_MULTI)) {
6791 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6792 return;
6793 }
6794
6795 freeClientMultiState(c);
6796 initClientMultiState(c);
6797 c->flags &= (~REDIS_MULTI);
6798 addReply(c,shared.ok);
6799}
6800
6e469882 6801static void execCommand(redisClient *c) {
6802 int j;
6803 robj **orig_argv;
6804 int orig_argc;
6805
6806 if (!(c->flags & REDIS_MULTI)) {
6807 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6808 return;
6809 }
6810
6811 orig_argv = c->argv;
6812 orig_argc = c->argc;
6813 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6814 for (j = 0; j < c->mstate.count; j++) {
6815 c->argc = c->mstate.commands[j].argc;
6816 c->argv = c->mstate.commands[j].argv;
6817 call(c,c->mstate.commands[j].cmd);
6818 }
6819 c->argv = orig_argv;
6820 c->argc = orig_argc;
6821 freeClientMultiState(c);
6822 initClientMultiState(c);
6823 c->flags &= (~REDIS_MULTI);
6824}
6825
4409877e 6826/* =========================== Blocking Operations ========================= */
6827
6828/* Currently Redis blocking operations support is limited to list POP ops,
6829 * so the current implementation is not fully generic, but it is also not
6830 * completely specific so it will not require a rewrite to support new
6831 * kind of blocking operations in the future.
6832 *
6833 * Still it's important to note that list blocking operations can be already
6834 * used as a notification mechanism in order to implement other blocking
6835 * operations at application level, so there must be a very strong evidence
6836 * of usefulness and generality before new blocking operations are implemented.
6837 *
6838 * This is how the current blocking POP works, we use BLPOP as example:
6839 * - If the user calls BLPOP and the key exists and contains a non empty list
6840 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6841 * if there is no need to block.
6842 * - If instead BLPOP is called and the key does not exist or the list is
6843 * empty we need to block. In order to do so we remove the notification for
6844 * new data to read in the client socket (so that we'll not serve new
6845 * requests if the blocking request is not served). Also we put the client
95242ab5 6846 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
4409877e 6847 * blocked on these keys.
6848 * - If a PUSH operation against a key with blocked clients waiting is
6849 * performed, we serve the first in the list: basically instead of pushing
6850 * the new element inside the list we return it to the (first / oldest)
6851 * blocking client, unblock the client, and remove it from the list.
6852 *
6853 * The above comment and the source code should be enough in order to understand
6854 * the implementation and modify / fix it later.
6855 */
6856
6857/* Set a client in blocking mode for the specified key, with the specified
6858 * timeout */
b177fd30 6859static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 6860 dictEntry *de;
6861 list *l;
b177fd30 6862 int j;
4409877e 6863
b177fd30 6864 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6865 c->blockingkeysnum = numkeys;
4409877e 6866 c->blockingto = timeout;
b177fd30 6867 for (j = 0; j < numkeys; j++) {
6868 /* Add the key in the client structure, to map clients -> keys */
6869 c->blockingkeys[j] = keys[j];
6870 incrRefCount(keys[j]);
4409877e 6871
b177fd30 6872 /* And in the other "side", to map keys -> clients */
6873 de = dictFind(c->db->blockingkeys,keys[j]);
6874 if (de == NULL) {
6875 int retval;
6876
6877 /* For every key we take a list of clients blocked for it */
6878 l = listCreate();
6879 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6880 incrRefCount(keys[j]);
6881 assert(retval == DICT_OK);
6882 } else {
6883 l = dictGetEntryVal(de);
6884 }
6885 listAddNodeTail(l,c);
4409877e 6886 }
b177fd30 6887 /* Mark the client as a blocked client */
4409877e 6888 c->flags |= REDIS_BLOCKED;
d5d55fc3 6889 server.blpop_blocked_clients++;
4409877e 6890}
6891
6892/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 6893static void unblockClientWaitingData(redisClient *c) {
4409877e 6894 dictEntry *de;
6895 list *l;
b177fd30 6896 int j;
4409877e 6897
b177fd30 6898 assert(c->blockingkeys != NULL);
6899 /* The client may wait for multiple keys, so unblock it for every key. */
6900 for (j = 0; j < c->blockingkeysnum; j++) {
6901 /* Remove this client from the list of clients waiting for this key. */
6902 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6903 assert(de != NULL);
6904 l = dictGetEntryVal(de);
6905 listDelNode(l,listSearchKey(l,c));
6906 /* If the list is empty we need to remove it to avoid wasting memory */
6907 if (listLength(l) == 0)
6908 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6909 decrRefCount(c->blockingkeys[j]);
6910 }
6911 /* Cleanup the client structure */
6912 zfree(c->blockingkeys);
6913 c->blockingkeys = NULL;
4409877e 6914 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 6915 server.blpop_blocked_clients--;
5921aa36 6916 /* We want to process data if there is some command waiting
b0d8747d 6917 * in the input buffer. Note that this is safe even if
6918 * unblockClientWaitingData() gets called from freeClient() because
6919 * freeClient() will be smart enough to call this function
6920 * *after* c->querybuf was set to NULL. */
4409877e 6921 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6922}
6923
6924/* This should be called from any function PUSHing into lists.
6925 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6926 * 'ele' is the element pushed.
6927 *
6928 * If the function returns 0 there was no client waiting for a list push
6929 * against this key.
6930 *
6931 * If the function returns 1 there was a client waiting for a list push
6932 * against this key, the element was passed to this client thus it's not
6933 * needed to actually add it to the list and the caller should return asap. */
6934static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6935 struct dictEntry *de;
6936 redisClient *receiver;
6937 list *l;
6938 listNode *ln;
6939
6940 de = dictFind(c->db->blockingkeys,key);
6941 if (de == NULL) return 0;
6942 l = dictGetEntryVal(de);
6943 ln = listFirst(l);
6944 assert(ln != NULL);
6945 receiver = ln->value;
4409877e 6946
b177fd30 6947 addReplySds(receiver,sdsnew("*2\r\n"));
6948 addReplyBulkLen(receiver,key);
6949 addReply(receiver,key);
6950 addReply(receiver,shared.crlf);
4409877e 6951 addReplyBulkLen(receiver,ele);
6952 addReply(receiver,ele);
6953 addReply(receiver,shared.crlf);
b0d8747d 6954 unblockClientWaitingData(receiver);
4409877e 6955 return 1;
6956}
6957
6958/* Blocking RPOP/LPOP */
6959static void blockingPopGenericCommand(redisClient *c, int where) {
6960 robj *o;
6961 time_t timeout;
b177fd30 6962 int j;
4409877e 6963
b177fd30 6964 for (j = 1; j < c->argc-1; j++) {
6965 o = lookupKeyWrite(c->db,c->argv[j]);
6966 if (o != NULL) {
6967 if (o->type != REDIS_LIST) {
6968 addReply(c,shared.wrongtypeerr);
4409877e 6969 return;
b177fd30 6970 } else {
6971 list *list = o->ptr;
6972 if (listLength(list) != 0) {
6973 /* If the list contains elements fall back to the usual
6974 * non-blocking POP operation */
6975 robj *argv[2], **orig_argv;
6976 int orig_argc;
6977
6978 /* We need to alter the command arguments before to call
6979 * popGenericCommand() as the command takes a single key. */
6980 orig_argv = c->argv;
6981 orig_argc = c->argc;
6982 argv[1] = c->argv[j];
6983 c->argv = argv;
6984 c->argc = 2;
6985
6986 /* Also the return value is different, we need to output
6987 * the multi bulk reply header and the key name. The
6988 * "real" command will add the last element (the value)
6989 * for us. If this souds like an hack to you it's just
6990 * because it is... */
6991 addReplySds(c,sdsnew("*2\r\n"));
6992 addReplyBulkLen(c,argv[1]);
6993 addReply(c,argv[1]);
6994 addReply(c,shared.crlf);
6995 popGenericCommand(c,where);
6996
6997 /* Fix the client structure with the original stuff */
6998 c->argv = orig_argv;
6999 c->argc = orig_argc;
7000 return;
7001 }
4409877e 7002 }
7003 }
7004 }
7005 /* If the list is empty or the key does not exists we must block */
b177fd30 7006 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7007 if (timeout > 0) timeout += time(NULL);
b177fd30 7008 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7009}
7010
7011static void blpopCommand(redisClient *c) {
7012 blockingPopGenericCommand(c,REDIS_HEAD);
7013}
7014
7015static void brpopCommand(redisClient *c) {
7016 blockingPopGenericCommand(c,REDIS_TAIL);
7017}
7018
ed9b544e 7019/* =============================== Replication ============================= */
7020
a4d1ba9a 7021static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7022 ssize_t nwritten, ret = size;
7023 time_t start = time(NULL);
7024
7025 timeout++;
7026 while(size) {
7027 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7028 nwritten = write(fd,ptr,size);
7029 if (nwritten == -1) return -1;
7030 ptr += nwritten;
7031 size -= nwritten;
7032 }
7033 if ((time(NULL)-start) > timeout) {
7034 errno = ETIMEDOUT;
7035 return -1;
7036 }
7037 }
7038 return ret;
7039}
7040
a4d1ba9a 7041static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7042 ssize_t nread, totread = 0;
7043 time_t start = time(NULL);
7044
7045 timeout++;
7046 while(size) {
7047 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7048 nread = read(fd,ptr,size);
7049 if (nread == -1) return -1;
7050 ptr += nread;
7051 size -= nread;
7052 totread += nread;
7053 }
7054 if ((time(NULL)-start) > timeout) {
7055 errno = ETIMEDOUT;
7056 return -1;
7057 }
7058 }
7059 return totread;
7060}
7061
/* Read a line from 'fd', one byte at a time using syncRead() with the
 * given 'timeout', and store it into 'ptr' as a NUL terminated string.
 *
 * 'size' is the size of the buffer pointed by 'ptr': at most 'size'-1
 * characters are stored, so that there is always room for the terminator.
 * Reading stops at the first '\n', that is not stored; a '\r' immediately
 * before it is stripped as well. If the buffer fills up before a newline
 * is found the function returns what was read so far.
 *
 * Returns the number of characters stored before the newline was found
 * (the stripped '\r', if any, is still counted), or -1 if syncRead() fails
 * or 'size' is not positive (errno is set to EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve one byte for the NUL terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One less byte available: without this the loop would write past
         * the end of the buffer when the line is longer than 'size'. */
        size--;
    }
    return nread;
}
7082
7083static void syncCommand(redisClient *c) {
40d224a9 7084 /* ignore SYNC if aleady slave or in monitor mode */
7085 if (c->flags & REDIS_SLAVE) return;
7086
7087 /* SYNC can't be issued when the server has pending data to send to
7088 * the client about already issued commands. We need a fresh reply
7089 * buffer registering the differences between the BGSAVE and the current
7090 * dataset, so that we can copy to other slaves if needed. */
7091 if (listLength(c->reply) != 0) {
7092 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7093 return;
7094 }
7095
7096 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7097 /* Here we need to check if there is a background saving operation
7098 * in progress, or if it is required to start one */
9d65a1bb 7099 if (server.bgsavechildpid != -1) {
40d224a9 7100 /* Ok a background save is in progress. Let's check if it is a good
7101 * one for replication, i.e. if there is another slave that is
7102 * registering differences since the server forked to save */
7103 redisClient *slave;
7104 listNode *ln;
c7df85a4 7105 listIter li;
40d224a9 7106
c7df85a4 7107 listRewind(server.slaves,&li);
7108 while((ln = listNext(&li))) {
40d224a9 7109 slave = ln->value;
7110 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7111 }
7112 if (ln) {
7113 /* Perfect, the server is already registering differences for
7114 * another slave. Set the right state, and copy the buffer. */
7115 listRelease(c->reply);
7116 c->reply = listDup(slave->reply);
40d224a9 7117 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7118 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7119 } else {
7120 /* No way, we need to wait for the next BGSAVE in order to
7121 * register differences */
7122 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7123 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7124 }
7125 } else {
7126 /* Ok we don't have a BGSAVE in progress, let's start one */
7127 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7128 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7129 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7130 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7131 return;
7132 }
7133 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7134 }
6208b3a7 7135 c->repldbfd = -1;
40d224a9 7136 c->flags |= REDIS_SLAVE;
7137 c->slaveseldb = 0;
6b47e12e 7138 listAddNodeTail(server.slaves,c);
40d224a9 7139 return;
7140}
7141
6208b3a7 7142static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7143 redisClient *slave = privdata;
7144 REDIS_NOTUSED(el);
7145 REDIS_NOTUSED(mask);
7146 char buf[REDIS_IOBUF_LEN];
7147 ssize_t nwritten, buflen;
7148
7149 if (slave->repldboff == 0) {
7150 /* Write the bulk write count before to transfer the DB. In theory here
7151 * we don't know how much room there is in the output buffer of the
7152 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7153 * operations) will never be smaller than the few bytes we need. */
7154 sds bulkcount;
7155
7156 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7157 slave->repldbsize);
7158 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7159 {
7160 sdsfree(bulkcount);
7161 freeClient(slave);
7162 return;
7163 }
7164 sdsfree(bulkcount);
7165 }
7166 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7167 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7168 if (buflen <= 0) {
7169 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7170 (buflen == 0) ? "premature EOF" : strerror(errno));
7171 freeClient(slave);
7172 return;
7173 }
7174 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7175 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7176 strerror(errno));
7177 freeClient(slave);
7178 return;
7179 }
7180 slave->repldboff += nwritten;
7181 if (slave->repldboff == slave->repldbsize) {
7182 close(slave->repldbfd);
7183 slave->repldbfd = -1;
7184 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7185 slave->replstate = REDIS_REPL_ONLINE;
7186 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7187 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7188 freeClient(slave);
7189 return;
7190 }
7191 addReplySds(slave,sdsempty());
7192 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7193 }
7194}
ed9b544e 7195
a3b21203 7196/* This function is called at the end of every backgrond saving.
7197 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7198 * otherwise REDIS_ERR is passed to the function.
7199 *
7200 * The goal of this function is to handle slaves waiting for a successful
7201 * background saving in order to perform non-blocking synchronization. */
7202static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7203 listNode *ln;
7204 int startbgsave = 0;
c7df85a4 7205 listIter li;
ed9b544e 7206
c7df85a4 7207 listRewind(server.slaves,&li);
7208 while((ln = listNext(&li))) {
6208b3a7 7209 redisClient *slave = ln->value;
ed9b544e 7210
6208b3a7 7211 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7212 startbgsave = 1;
7213 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7214 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7215 struct redis_stat buf;
6208b3a7 7216
7217 if (bgsaveerr != REDIS_OK) {
7218 freeClient(slave);
7219 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7220 continue;
7221 }
7222 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7223 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7224 freeClient(slave);
7225 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7226 continue;
7227 }
7228 slave->repldboff = 0;
7229 slave->repldbsize = buf.st_size;
7230 slave->replstate = REDIS_REPL_SEND_BULK;
7231 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7232 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7233 freeClient(slave);
7234 continue;
7235 }
7236 }
ed9b544e 7237 }
6208b3a7 7238 if (startbgsave) {
7239 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7240 listIter li;
7241
7242 listRewind(server.slaves,&li);
6208b3a7 7243 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7244 while((ln = listNext(&li))) {
6208b3a7 7245 redisClient *slave = ln->value;
ed9b544e 7246
6208b3a7 7247 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7248 freeClient(slave);
7249 }
7250 }
7251 }
ed9b544e 7252}
7253
7254static int syncWithMaster(void) {
d0ccebcf 7255 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7256 long dumpsize;
ed9b544e 7257 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7258 int dfd, maxtries = 5;
ed9b544e 7259
7260 if (fd == -1) {
7261 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7262 strerror(errno));
7263 return REDIS_ERR;
7264 }
d0ccebcf 7265
7266 /* AUTH with the master if required. */
7267 if(server.masterauth) {
7268 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7269 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7270 close(fd);
7271 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7272 strerror(errno));
7273 return REDIS_ERR;
7274 }
7275 /* Read the AUTH result. */
7276 if (syncReadLine(fd,buf,1024,3600) == -1) {
7277 close(fd);
7278 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7279 strerror(errno));
7280 return REDIS_ERR;
7281 }
7282 if (buf[0] != '+') {
7283 close(fd);
7284 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7285 return REDIS_ERR;
7286 }
7287 }
7288
ed9b544e 7289 /* Issue the SYNC command */
7290 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7291 close(fd);
7292 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7293 strerror(errno));
7294 return REDIS_ERR;
7295 }
7296 /* Read the bulk write count */
8c4d91fc 7297 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7298 close(fd);
7299 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7300 strerror(errno));
7301 return REDIS_ERR;
7302 }
4aa701c1 7303 if (buf[0] != '$') {
7304 close(fd);
7305 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7306 return REDIS_ERR;
7307 }
18e61fa2 7308 dumpsize = strtol(buf+1,NULL,10);
7309 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7310 /* Read the bulk write data on a temp file */
8c5abee8 7311 while(maxtries--) {
7312 snprintf(tmpfile,256,
7313 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7314 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7315 if (dfd != -1) break;
5de9ad7c 7316 sleep(1);
8c5abee8 7317 }
ed9b544e 7318 if (dfd == -1) {
7319 close(fd);
7320 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7321 return REDIS_ERR;
7322 }
7323 while(dumpsize) {
7324 int nread, nwritten;
7325
7326 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7327 if (nread == -1) {
7328 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7329 strerror(errno));
7330 close(fd);
7331 close(dfd);
7332 return REDIS_ERR;
7333 }
7334 nwritten = write(dfd,buf,nread);
7335 if (nwritten == -1) {
7336 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7337 close(fd);
7338 close(dfd);
7339 return REDIS_ERR;
7340 }
7341 dumpsize -= nread;
7342 }
7343 close(dfd);
7344 if (rename(tmpfile,server.dbfilename) == -1) {
7345 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7346 unlink(tmpfile);
7347 close(fd);
7348 return REDIS_ERR;
7349 }
7350 emptyDb();
f78fd11b 7351 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7352 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7353 close(fd);
7354 return REDIS_ERR;
7355 }
7356 server.master = createClient(fd);
7357 server.master->flags |= REDIS_MASTER;
179b3952 7358 server.master->authenticated = 1;
ed9b544e 7359 server.replstate = REDIS_REPL_CONNECTED;
7360 return REDIS_OK;
7361}
7362
321b0e13 7363static void slaveofCommand(redisClient *c) {
7364 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7365 !strcasecmp(c->argv[2]->ptr,"one")) {
7366 if (server.masterhost) {
7367 sdsfree(server.masterhost);
7368 server.masterhost = NULL;
7369 if (server.master) freeClient(server.master);
7370 server.replstate = REDIS_REPL_NONE;
7371 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7372 }
7373 } else {
7374 sdsfree(server.masterhost);
7375 server.masterhost = sdsdup(c->argv[1]->ptr);
7376 server.masterport = atoi(c->argv[2]->ptr);
7377 if (server.master) freeClient(server.master);
7378 server.replstate = REDIS_REPL_CONNECT;
7379 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7380 server.masterhost, server.masterport);
7381 }
7382 addReply(c,shared.ok);
7383}
7384
3fd78bcd 7385/* ============================ Maxmemory directive ======================== */
7386
a5819310 7387/* Try to free one object form the pre-allocated objects free list.
7388 * This is useful under low mem conditions as by default we take 1 million
7389 * free objects allocated. On success REDIS_OK is returned, otherwise
7390 * REDIS_ERR. */
7391static int tryFreeOneObjectFromFreelist(void) {
f870935d 7392 robj *o;
7393
a5819310 7394 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7395 if (listLength(server.objfreelist)) {
7396 listNode *head = listFirst(server.objfreelist);
7397 o = listNodeValue(head);
7398 listDelNode(server.objfreelist,head);
7399 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7400 zfree(o);
7401 return REDIS_OK;
7402 } else {
7403 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7404 return REDIS_ERR;
7405 }
f870935d 7406}
7407
3fd78bcd 7408/* This function gets called when 'maxmemory' is set on the config file to limit
7409 * the max memory used by the server, and we are out of memory.
7410 * This function will try to, in order:
7411 *
7412 * - Free objects from the free list
7413 * - Try to remove keys with an EXPIRE set
7414 *
7415 * It is not possible to free enough memory to reach used-memory < maxmemory
7416 * the server will start refusing commands that will enlarge even more the
7417 * memory usage.
7418 */
7419static void freeMemoryIfNeeded(void) {
7420 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7421 int j, k, freed = 0;
7422
7423 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7424 for (j = 0; j < server.dbnum; j++) {
7425 int minttl = -1;
7426 robj *minkey = NULL;
7427 struct dictEntry *de;
7428
7429 if (dictSize(server.db[j].expires)) {
7430 freed = 1;
7431 /* From a sample of three keys drop the one nearest to
7432 * the natural expire */
7433 for (k = 0; k < 3; k++) {
7434 time_t t;
7435
7436 de = dictGetRandomKey(server.db[j].expires);
7437 t = (time_t) dictGetEntryVal(de);
7438 if (minttl == -1 || t < minttl) {
7439 minkey = dictGetEntryKey(de);
7440 minttl = t;
3fd78bcd 7441 }
3fd78bcd 7442 }
a5819310 7443 deleteKey(server.db+j,minkey);
3fd78bcd 7444 }
3fd78bcd 7445 }
a5819310 7446 if (!freed) return; /* nothing to free... */
3fd78bcd 7447 }
7448}
7449
f80dff62 7450/* ============================== Append Only file ========================== */
7451
7452static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7453 sds buf = sdsempty();
7454 int j;
7455 ssize_t nwritten;
7456 time_t now;
7457 robj *tmpargv[3];
7458
7459 /* The DB this command was targetting is not the same as the last command
7460 * we appendend. To issue a SELECT command is needed. */
7461 if (dictid != server.appendseldb) {
7462 char seldb[64];
7463
7464 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7465 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7466 (unsigned long)strlen(seldb),seldb);
f80dff62 7467 server.appendseldb = dictid;
7468 }
7469
7470 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7471 * EXPIREs into EXPIREATs calls */
7472 if (cmd->proc == expireCommand) {
7473 long when;
7474
7475 tmpargv[0] = createStringObject("EXPIREAT",8);
7476 tmpargv[1] = argv[1];
7477 incrRefCount(argv[1]);
7478 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7479 tmpargv[2] = createObject(REDIS_STRING,
7480 sdscatprintf(sdsempty(),"%ld",when));
7481 argv = tmpargv;
7482 }
7483
7484 /* Append the actual command */
7485 buf = sdscatprintf(buf,"*%d\r\n",argc);
7486 for (j = 0; j < argc; j++) {
7487 robj *o = argv[j];
7488
9d65a1bb 7489 o = getDecodedObject(o);
83c6a618 7490 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7491 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7492 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7493 decrRefCount(o);
f80dff62 7494 }
7495
7496 /* Free the objects from the modified argv for EXPIREAT */
7497 if (cmd->proc == expireCommand) {
7498 for (j = 0; j < 3; j++)
7499 decrRefCount(argv[j]);
7500 }
7501
7502 /* We want to perform a single write. This should be guaranteed atomic
7503 * at least if the filesystem we are writing is a real physical one.
7504 * While this will save us against the server being killed I don't think
7505 * there is much to do about the whole server stopping for power problems
7506 * or alike */
7507 nwritten = write(server.appendfd,buf,sdslen(buf));
7508 if (nwritten != (signed)sdslen(buf)) {
7509 /* Ooops, we are in troubles. The best thing to do for now is
7510 * to simply exit instead to give the illusion that everything is
7511 * working as expected. */
7512 if (nwritten == -1) {
7513 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7514 } else {
7515 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7516 }
7517 exit(1);
7518 }
85a83172 7519 /* If a background append only file rewriting is in progress we want to
7520 * accumulate the differences between the child DB and the current one
7521 * in a buffer, so that when the child process will do its work we
7522 * can append the differences to the new append only file. */
7523 if (server.bgrewritechildpid != -1)
7524 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7525
7526 sdsfree(buf);
f80dff62 7527 now = time(NULL);
7528 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7529 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7530 now-server.lastfsync > 1))
7531 {
7532 fsync(server.appendfd); /* Let's try to get this data on the disk */
7533 server.lastfsync = now;
7534 }
7535}
7536
7537/* In Redis commands are always executed in the context of a client, so in
7538 * order to load the append only file we need to create a fake client. */
7539static struct redisClient *createFakeClient(void) {
7540 struct redisClient *c = zmalloc(sizeof(*c));
7541
7542 selectDb(c,0);
7543 c->fd = -1;
7544 c->querybuf = sdsempty();
7545 c->argc = 0;
7546 c->argv = NULL;
7547 c->flags = 0;
9387d17d 7548 /* We set the fake client as a slave waiting for the synchronization
7549 * so that Redis will not try to send replies to this client. */
7550 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7551 c->reply = listCreate();
7552 listSetFreeMethod(c->reply,decrRefCount);
7553 listSetDupMethod(c->reply,dupClientReplyValue);
7554 return c;
7555}
7556
7557static void freeFakeClient(struct redisClient *c) {
7558 sdsfree(c->querybuf);
7559 listRelease(c->reply);
7560 zfree(c);
7561}
7562
7563/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7564 * error (the append only file is zero-length) REDIS_ERR is returned. On
7565 * fatal error an error message is logged and the program exists. */
7566int loadAppendOnlyFile(char *filename) {
7567 struct redisClient *fakeClient;
7568 FILE *fp = fopen(filename,"r");
7569 struct redis_stat sb;
b492cf00 7570 unsigned long long loadedkeys = 0;
f80dff62 7571
7572 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7573 return REDIS_ERR;
7574
7575 if (fp == NULL) {
7576 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7577 exit(1);
7578 }
7579
7580 fakeClient = createFakeClient();
7581 while(1) {
7582 int argc, j;
7583 unsigned long len;
7584 robj **argv;
7585 char buf[128];
7586 sds argsds;
7587 struct redisCommand *cmd;
7588
7589 if (fgets(buf,sizeof(buf),fp) == NULL) {
7590 if (feof(fp))
7591 break;
7592 else
7593 goto readerr;
7594 }
7595 if (buf[0] != '*') goto fmterr;
7596 argc = atoi(buf+1);
7597 argv = zmalloc(sizeof(robj*)*argc);
7598 for (j = 0; j < argc; j++) {
7599 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7600 if (buf[0] != '$') goto fmterr;
7601 len = strtol(buf+1,NULL,10);
7602 argsds = sdsnewlen(NULL,len);
0f151ef1 7603 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7604 argv[j] = createObject(REDIS_STRING,argsds);
7605 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7606 }
7607
7608 /* Command lookup */
7609 cmd = lookupCommand(argv[0]->ptr);
7610 if (!cmd) {
7611 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7612 exit(1);
7613 }
7614 /* Try object sharing and encoding */
7615 if (server.shareobjects) {
7616 int j;
7617 for(j = 1; j < argc; j++)
7618 argv[j] = tryObjectSharing(argv[j]);
7619 }
7620 if (cmd->flags & REDIS_CMD_BULK)
7621 tryObjectEncoding(argv[argc-1]);
7622 /* Run the command in the context of a fake client */
7623 fakeClient->argc = argc;
7624 fakeClient->argv = argv;
7625 cmd->proc(fakeClient);
7626 /* Discard the reply objects list from the fake client */
7627 while(listLength(fakeClient->reply))
7628 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7629 /* Clean up, ready for the next command */
7630 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7631 zfree(argv);
b492cf00 7632 /* Handle swapping while loading big datasets when VM is on */
7633 loadedkeys++;
7634 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7635 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7636 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7637 }
7638 }
f80dff62 7639 }
7640 fclose(fp);
7641 freeFakeClient(fakeClient);
7642 return REDIS_OK;
7643
7644readerr:
7645 if (feof(fp)) {
7646 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7647 } else {
7648 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7649 }
7650 exit(1);
7651fmterr:
7652 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7653 exit(1);
7654}
7655
9d65a1bb 7656/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7657static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7658 char buf[128];
b9bc0eef 7659 int decrrc = 0;
7660
f2d9f50f 7661 /* Avoid the incr/decr ref count business if possible to help
7662 * copy-on-write (we are often in a child process when this function
7663 * is called).
7664 * Also makes sure that key objects don't get incrRefCount-ed when VM
7665 * is enabled */
7666 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 7667 obj = getDecodedObject(obj);
7668 decrrc = 1;
7669 }
9d65a1bb 7670 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7671 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 7672 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7673 goto err;
9d65a1bb 7674 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 7675 if (decrrc) decrRefCount(obj);
9d65a1bb 7676 return 1;
7677err:
b9bc0eef 7678 if (decrrc) decrRefCount(obj);
9d65a1bb 7679 return 0;
7680}
7681
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it may contain any byte, and is
 * not accessed when 'len' is zero). Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7693
/* Write a double into 'fp' as a bulk string $<count>\r\n<payload>\r\n,
 * where the payload is the "%.17g" representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the count announced in the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7704
/* Write a long into 'fp' as a bulk string $<count>\r\n<payload>\r\n,
 * where the payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the count announced in the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payloadlen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7715
7716/* Write a sequence of commands able to fully rebuild the dataset into
7717 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7718static int rewriteAppendOnlyFile(char *filename) {
7719 dictIterator *di = NULL;
7720 dictEntry *de;
7721 FILE *fp;
7722 char tmpfile[256];
7723 int j;
7724 time_t now = time(NULL);
7725
7726 /* Note that we have to use a different temp name here compared to the
7727 * one used by rewriteAppendOnlyFileBackground() function. */
7728 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7729 fp = fopen(tmpfile,"w");
7730 if (!fp) {
7731 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7732 return REDIS_ERR;
7733 }
7734 for (j = 0; j < server.dbnum; j++) {
7735 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7736 redisDb *db = server.db+j;
7737 dict *d = db->dict;
7738 if (dictSize(d) == 0) continue;
7739 di = dictGetIterator(d);
7740 if (!di) {
7741 fclose(fp);
7742 return REDIS_ERR;
7743 }
7744
7745 /* SELECT the new DB */
7746 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 7747 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 7748
7749 /* Iterate this DB writing every entry */
7750 while((de = dictNext(di)) != NULL) {
e7546c63 7751 robj *key, *o;
7752 time_t expiretime;
7753 int swapped;
7754
7755 key = dictGetEntryKey(de);
b9bc0eef 7756 /* If the value for this key is swapped, load a preview in memory.
7757 * We use a "swapped" flag to remember if we need to free the
7758 * value object instead to just increment the ref count anyway
7759 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 7760 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7761 key->storage == REDIS_VM_SWAPPING) {
e7546c63 7762 o = dictGetEntryVal(de);
7763 swapped = 0;
7764 } else {
7765 o = vmPreviewObject(key);
e7546c63 7766 swapped = 1;
7767 }
7768 expiretime = getExpire(db,key);
9d65a1bb 7769
7770 /* Save the key and associated value */
9d65a1bb 7771 if (o->type == REDIS_STRING) {
7772 /* Emit a SET command */
7773 char cmd[]="*3\r\n$3\r\nSET\r\n";
7774 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7775 /* Key and value */
9c8e3cee 7776 if (fwriteBulkObject(fp,key) == 0) goto werr;
7777 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 7778 } else if (o->type == REDIS_LIST) {
7779 /* Emit the RPUSHes needed to rebuild the list */
7780 list *list = o->ptr;
7781 listNode *ln;
c7df85a4 7782 listIter li;
9d65a1bb 7783
c7df85a4 7784 listRewind(list,&li);
7785 while((ln = listNext(&li))) {
9d65a1bb 7786 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7787 robj *eleobj = listNodeValue(ln);
7788
7789 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7790 if (fwriteBulkObject(fp,key) == 0) goto werr;
7791 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7792 }
7793 } else if (o->type == REDIS_SET) {
7794 /* Emit the SADDs needed to rebuild the set */
7795 dict *set = o->ptr;
7796 dictIterator *di = dictGetIterator(set);
7797 dictEntry *de;
7798
7799 while((de = dictNext(di)) != NULL) {
7800 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7801 robj *eleobj = dictGetEntryKey(de);
7802
7803 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7804 if (fwriteBulkObject(fp,key) == 0) goto werr;
7805 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7806 }
7807 dictReleaseIterator(di);
7808 } else if (o->type == REDIS_ZSET) {
7809 /* Emit the ZADDs needed to rebuild the sorted set */
7810 zset *zs = o->ptr;
7811 dictIterator *di = dictGetIterator(zs->dict);
7812 dictEntry *de;
7813
7814 while((de = dictNext(di)) != NULL) {
7815 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7816 robj *eleobj = dictGetEntryKey(de);
7817 double *score = dictGetEntryVal(de);
7818
7819 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7820 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7821 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 7822 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7823 }
7824 dictReleaseIterator(di);
9c8e3cee 7825 } else if (o->type == REDIS_HASH) {
7826 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7827
7828 /* Emit the HSETs needed to rebuild the hash */
7829 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7830 unsigned char *p = zipmapRewind(o->ptr);
7831 unsigned char *field, *val;
7832 unsigned int flen, vlen;
7833
7834 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7835 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7836 if (fwriteBulkObject(fp,key) == 0) goto werr;
7837 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7838 return -1;
7839 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7840 return -1;
7841 }
7842 } else {
7843 dictIterator *di = dictGetIterator(o->ptr);
7844 dictEntry *de;
7845
7846 while((de = dictNext(di)) != NULL) {
7847 robj *field = dictGetEntryKey(de);
7848 robj *val = dictGetEntryVal(de);
7849
7850 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7851 if (fwriteBulkObject(fp,key) == 0) goto werr;
7852 if (fwriteBulkObject(fp,field) == -1) return -1;
7853 if (fwriteBulkObject(fp,val) == -1) return -1;
7854 }
7855 dictReleaseIterator(di);
7856 }
9d65a1bb 7857 } else {
dfc5e96c 7858 redisAssert(0 != 0);
9d65a1bb 7859 }
7860 /* Save the expire time */
7861 if (expiretime != -1) {
e96e4fbf 7862 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 7863 /* If this key is already expired skip it */
7864 if (expiretime < now) continue;
7865 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7866 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7867 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7868 }
b9bc0eef 7869 if (swapped) decrRefCount(o);
9d65a1bb 7870 }
7871 dictReleaseIterator(di);
7872 }
7873
7874 /* Make sure data will not remain on the OS's output buffers */
7875 fflush(fp);
7876 fsync(fileno(fp));
7877 fclose(fp);
7878
7879 /* Use RENAME to make sure the DB file is changed atomically only
7880 * if the generate DB file is ok. */
7881 if (rename(tmpfile,filename) == -1) {
7882 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7883 unlink(tmpfile);
7884 return REDIS_ERR;
7885 }
7886 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7887 return REDIS_OK;
7888
7889werr:
7890 fclose(fp);
7891 unlink(tmpfile);
e96e4fbf 7892 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 7893 if (di) dictReleaseIterator(di);
7894 return REDIS_ERR;
7895}
7896
7897/* This is how rewriting of the append only file in background works:
7898 *
7899 * 1) The user calls BGREWRITEAOF
7900 * 2) Redis calls this function, that forks():
7901 * 2a) the child rewrite the append only file in a temp file.
7902 * 2b) the parent accumulates differences in server.bgrewritebuf.
7903 * 3) When the child finished '2a' exists.
7904 * 4) The parent will trap the exit code, if it's OK, will append the
7905 * data accumulated into server.bgrewritebuf into the temp file, and
7906 * finally will rename(2) the temp file in the actual file name.
7907 * The the new file is reopened as the new append only file. Profit!
7908 */
7909static int rewriteAppendOnlyFileBackground(void) {
7910 pid_t childpid;
7911
7912 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 7913 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 7914 if ((childpid = fork()) == 0) {
7915 /* Child */
7916 char tmpfile[256];
9d65a1bb 7917
054e426d 7918 if (server.vm_enabled) vmReopenSwapFile();
7919 close(server.fd);
9d65a1bb 7920 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7921 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 7922 _exit(0);
9d65a1bb 7923 } else {
478c2c6f 7924 _exit(1);
9d65a1bb 7925 }
7926 } else {
7927 /* Parent */
7928 if (childpid == -1) {
7929 redisLog(REDIS_WARNING,
7930 "Can't rewrite append only file in background: fork: %s",
7931 strerror(errno));
7932 return REDIS_ERR;
7933 }
7934 redisLog(REDIS_NOTICE,
7935 "Background append only file rewriting started by pid %d",childpid);
7936 server.bgrewritechildpid = childpid;
85a83172 7937 /* We set appendseldb to -1 in order to force the next call to the
7938 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7939 * accumulated by the parent into server.bgrewritebuf will start
7940 * with a SELECT statement and it will be safe to merge. */
7941 server.appendseldb = -1;
9d65a1bb 7942 return REDIS_OK;
7943 }
7944 return REDIS_OK; /* unreached */
7945}
7946
7947static void bgrewriteaofCommand(redisClient *c) {
7948 if (server.bgrewritechildpid != -1) {
7949 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7950 return;
7951 }
7952 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 7953 char *status = "+Background append only file rewriting started\r\n";
7954 addReplySds(c,sdsnew(status));
9d65a1bb 7955 } else {
7956 addReply(c,shared.err);
7957 }
7958}
7959
/* Remove the temp file used by the background rewrite performed by the
 * child with the specified pid. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7966
996cb5f7 7967/* Virtual Memory is composed mainly of two subsystems:
7968 * - Blocking Virtual Memory
7969 * - Threaded Virtual Memory I/O
7970 * The two parts are not fully decoupled, but functions are split among two
7971 * different sections of the source code (delimited by comments) in order to
7972 * make more clear what functionality is about the blocking VM and what about
7973 * the threaded (not blocking) VM.
7974 *
7975 * Redis VM design:
7976 *
7977 * Redis VM is a blocking VM (one that blocks reading swapped values from
7978 * disk into memory when a value swapped out is needed in memory) that is made
7979 * unblocking by trying to examine the command argument vector in order to
7980 * load in background values that will likely be needed in order to exec
7981 * the command. The command is executed only once all the relevant keys
7982 * are loaded into memory.
7983 *
7984 * This is basically almost as simple as a blocking VM, but almost as parallel
7985 * as a fully non-blocking VM.
7986 */
7987
7988/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 7989
7990/* substitute the first occurrence of '%p' with the process pid in the
7991 * swap file name. */
7992static void expandVmSwapFilename(void) {
7993 char *p = strstr(server.vm_swap_file,"%p");
7994 sds new;
7995
7996 if (!p) return;
7997 new = sdsempty();
7998 *p = '\0';
7999 new = sdscat(new,server.vm_swap_file);
8000 new = sdscatprintf(new,"%ld",(long) getpid());
8001 new = sdscat(new,p+2);
8002 zfree(server.vm_swap_file);
8003 server.vm_swap_file = new;
8004}
8005
75680a3c 8006static void vmInit(void) {
8007 off_t totsize;
996cb5f7 8008 int pipefds[2];
bcaa7a4f 8009 size_t stacksize;
75680a3c 8010
4ad37480 8011 if (server.vm_max_threads != 0)
8012 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8013
054e426d 8014 expandVmSwapFilename();
8015 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8016 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8017 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8018 }
75680a3c 8019 if (server.vm_fp == NULL) {
6fa987e3 8020 redisLog(REDIS_WARNING,
8021 "Impossible to open the swap file: %s. Exiting.",
8022 strerror(errno));
75680a3c 8023 exit(1);
8024 }
8025 server.vm_fd = fileno(server.vm_fp);
8026 server.vm_next_page = 0;
8027 server.vm_near_pages = 0;
7d98e08c 8028 server.vm_stats_used_pages = 0;
8029 server.vm_stats_swapped_objects = 0;
8030 server.vm_stats_swapouts = 0;
8031 server.vm_stats_swapins = 0;
75680a3c 8032 totsize = server.vm_pages*server.vm_page_size;
8033 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8034 if (ftruncate(server.vm_fd,totsize) == -1) {
8035 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8036 strerror(errno));
8037 exit(1);
8038 } else {
8039 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8040 }
7d30035d 8041 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8042 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8043 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8044 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8045
996cb5f7 8046 /* Initialize threaded I/O (used by Virtual Memory) */
8047 server.io_newjobs = listCreate();
8048 server.io_processing = listCreate();
8049 server.io_processed = listCreate();
d5d55fc3 8050 server.io_ready_clients = listCreate();
92f8e882 8051 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8052 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8053 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8054 server.io_active_threads = 0;
996cb5f7 8055 if (pipe(pipefds) == -1) {
8056 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8057 ,strerror(errno));
8058 exit(1);
8059 }
8060 server.io_ready_pipe_read = pipefds[0];
8061 server.io_ready_pipe_write = pipefds[1];
8062 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8063 /* LZF requires a lot of stack */
8064 pthread_attr_init(&server.io_threads_attr);
8065 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8066 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8067 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8068 /* Listen for events in the threaded I/O pipe */
8069 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8070 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8071 oom("creating file event");
75680a3c 8072}
8073
06224fec 8074/* Mark the page as used */
8075static void vmMarkPageUsed(off_t page) {
8076 off_t byte = page/8;
8077 int bit = page&7;
970e10bb 8078 redisAssert(vmFreePage(page) == 1);
06224fec 8079 server.vm_bitmap[byte] |= 1<<bit;
8080}
8081
8082/* Mark N contiguous pages as used, with 'page' being the first. */
8083static void vmMarkPagesUsed(off_t page, off_t count) {
8084 off_t j;
8085
8086 for (j = 0; j < count; j++)
7d30035d 8087 vmMarkPageUsed(page+j);
7d98e08c 8088 server.vm_stats_used_pages += count;
7c775e09 8089 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8090 (long long)count, (long long)page);
06224fec 8091}
8092
8093/* Mark the page as free */
8094static void vmMarkPageFree(off_t page) {
8095 off_t byte = page/8;
8096 int bit = page&7;
970e10bb 8097 redisAssert(vmFreePage(page) == 0);
06224fec 8098 server.vm_bitmap[byte] &= ~(1<<bit);
8099}
8100
8101/* Mark N contiguous pages as free, with 'page' being the first. */
8102static void vmMarkPagesFree(off_t page, off_t count) {
8103 off_t j;
8104
8105 for (j = 0; j < count; j++)
7d30035d 8106 vmMarkPageFree(page+j);
7d98e08c 8107 server.vm_stats_used_pages -= count;
7c775e09 8108 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8109 (long long)count, (long long)page);
06224fec 8110}
8111
8112/* Test if the page is free */
8113static int vmFreePage(off_t page) {
8114 off_t byte = page/8;
8115 int bit = page&7;
7d30035d 8116 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8117}
8118
8119/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 8120 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8121 * REDIS_ERR is returned.
06224fec 8122 *
8123 * This function uses a simple algorithm: we try to allocate
8124 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8125 * again from the start of the swap file searching for free spaces.
8126 *
8127 * If it looks pretty clear that there are no free pages near our offset
8128 * we try to find less populated places doing a forward jump of
8129 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8130 * without hurry, and then we jump again and so forth...
8131 *
8132 * This function can be improved using a free list to avoid to guess
8133 * too much, since we could collect data about freed pages.
8134 *
8135 * note: I implemented this function just after watching an episode of
8136 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8137 */
c7df85a4 8138static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8139 off_t base, offset = 0, since_jump = 0, numfree = 0;
8140
8141 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8142 server.vm_near_pages = 0;
8143 server.vm_next_page = 0;
8144 }
8145 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8146 base = server.vm_next_page;
8147
8148 while(offset < server.vm_pages) {
8149 off_t this = base+offset;
8150
8151 /* If we overflow, restart from page zero */
8152 if (this >= server.vm_pages) {
8153 this -= server.vm_pages;
8154 if (this == 0) {
8155 /* Just overflowed, what we found on tail is no longer
8156 * interesting, as it's no longer contiguous. */
8157 numfree = 0;
8158 }
8159 }
8160 if (vmFreePage(this)) {
8161 /* This is a free page */
8162 numfree++;
8163 /* Already got N free pages? Return to the caller, with success */
8164 if (numfree == n) {
7d30035d 8165 *first = this-(n-1);
8166 server.vm_next_page = this+1;
7c775e09 8167 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8168 return REDIS_OK;
06224fec 8169 }
8170 } else {
8171 /* The current one is not a free page */
8172 numfree = 0;
8173 }
8174
8175 /* Fast-forward if the current page is not free and we already
8176 * searched enough near this place. */
8177 since_jump++;
8178 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8179 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8180 since_jump = 0;
8181 /* Note that even if we rewind after the jump, we are don't need
8182 * to make sure numfree is set to zero as we only jump *if* it
8183 * is set to zero. */
8184 } else {
8185 /* Otherwise just check the next page */
8186 offset++;
8187 }
8188 }
3a66edc7 8189 return REDIS_ERR;
8190}
8191
a5819310 8192/* Write the specified object at the specified page of the swap file */
8193static int vmWriteObjectOnSwap(robj *o, off_t page) {
8194 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8195 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8196 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8197 redisLog(REDIS_WARNING,
9ebed7cf 8198 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8199 strerror(errno));
8200 return REDIS_ERR;
8201 }
8202 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8203 fflush(server.vm_fp);
a5819310 8204 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8205 return REDIS_OK;
8206}
8207
3a66edc7 8208/* Swap the 'val' object relative to 'key' into disk. Store all the information
8209 * needed to later retrieve the object into the key object.
8210 * If we can't find enough contiguous empty pages to swap the object on disk
8211 * REDIS_ERR is returned. */
a69a0c9c 8212static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8213 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8214 off_t page;
8215
8216 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8217 assert(key->refcount == 1);
3a66edc7 8218 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8219 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8220 key->vm.page = page;
8221 key->vm.usedpages = pages;
8222 key->storage = REDIS_VM_SWAPPED;
d894161b 8223 key->vtype = val->type;
3a66edc7 8224 decrRefCount(val); /* Deallocate the object from memory. */
8225 vmMarkPagesUsed(page,pages);
7d30035d 8226 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8227 (unsigned char*) key->ptr,
8228 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8229 server.vm_stats_swapped_objects++;
8230 server.vm_stats_swapouts++;
3a66edc7 8231 return REDIS_OK;
8232}
8233
a5819310 8234static robj *vmReadObjectFromSwap(off_t page, int type) {
8235 robj *o;
3a66edc7 8236
a5819310 8237 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8238 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8239 redisLog(REDIS_WARNING,
d5d55fc3 8240 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8241 strerror(errno));
478c2c6f 8242 _exit(1);
3a66edc7 8243 }
a5819310 8244 o = rdbLoadObject(type,server.vm_fp);
8245 if (o == NULL) {
d5d55fc3 8246 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8247 _exit(1);
3a66edc7 8248 }
a5819310 8249 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8250 return o;
8251}
8252
8253/* Load the value object relative to the 'key' object from swap to memory.
8254 * The newly allocated object is returned.
8255 *
8256 * If preview is true the unserialized object is returned to the caller but
8257 * no changes are made to the key object, nor the pages are marked as freed */
8258static robj *vmGenericLoadObject(robj *key, int preview) {
8259 robj *val;
8260
d5d55fc3 8261 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8262 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8263 if (!preview) {
8264 key->storage = REDIS_VM_MEMORY;
8265 key->vm.atime = server.unixtime;
8266 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8267 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8268 (unsigned char*) key->ptr);
7d98e08c 8269 server.vm_stats_swapped_objects--;
38aba9a1 8270 } else {
8271 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8272 (unsigned char*) key->ptr);
7e69548d 8273 }
7d98e08c 8274 server.vm_stats_swapins++;
3a66edc7 8275 return val;
06224fec 8276}
8277
7e69548d 8278/* Plain object loading, from swap to memory */
8279static robj *vmLoadObject(robj *key) {
996cb5f7 8280 /* If we are loading the object in background, stop it, we
8281 * need to load this object synchronously ASAP. */
8282 if (key->storage == REDIS_VM_LOADING)
8283 vmCancelThreadedIOJob(key);
7e69548d 8284 return vmGenericLoadObject(key,0);
8285}
8286
8287/* Just load the value on disk, without to modify the key.
8288 * This is useful when we want to perform some operation on the value
8289 * without to really bring it from swap to memory, like while saving the
8290 * dataset or rewriting the append only log. */
8291static robj *vmPreviewObject(robj *key) {
8292 return vmGenericLoadObject(key,1);
8293}
8294
4ef8de8a 8295/* How a good candidate is this object for swapping?
8296 * The better candidate it is, the greater the returned value.
8297 *
8298 * Currently we try to perform a fast estimation of the object size in
8299 * memory, and combine it with aging informations.
8300 *
8301 * Basically swappability = idle-time * log(estimated size)
8302 *
8303 * Bigger objects are preferred over smaller objects, but not
8304 * proportionally, this is why we use the logarithm. This algorithm is
8305 * just a first try and will probably be tuned later. */
8306static double computeObjectSwappability(robj *o) {
8307 time_t age = server.unixtime - o->vm.atime;
8308 long asize = 0;
8309 list *l;
8310 dict *d;
8311 struct dictEntry *de;
8312 int z;
8313
8314 if (age <= 0) return 0;
8315 switch(o->type) {
8316 case REDIS_STRING:
8317 if (o->encoding != REDIS_ENCODING_RAW) {
8318 asize = sizeof(*o);
8319 } else {
8320 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8321 }
8322 break;
8323 case REDIS_LIST:
8324 l = o->ptr;
8325 listNode *ln = listFirst(l);
8326
8327 asize = sizeof(list);
8328 if (ln) {
8329 robj *ele = ln->value;
8330 long elesize;
8331
8332 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8333 (sizeof(*o)+sdslen(ele->ptr)) :
8334 sizeof(*o);
8335 asize += (sizeof(listNode)+elesize)*listLength(l);
8336 }
8337 break;
8338 case REDIS_SET:
8339 case REDIS_ZSET:
8340 z = (o->type == REDIS_ZSET);
8341 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8342
8343 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8344 if (z) asize += sizeof(zset)-sizeof(dict);
8345 if (dictSize(d)) {
8346 long elesize;
8347 robj *ele;
8348
8349 de = dictGetRandomKey(d);
8350 ele = dictGetEntryKey(de);
8351 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8352 (sizeof(*o)+sdslen(ele->ptr)) :
8353 sizeof(*o);
8354 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8355 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8356 }
8357 break;
8358 }
c8c72447 8359 return (double)age*log(1+asize);
4ef8de8a 8360}
8361
8362/* Try to swap an object that's a good candidate for swapping.
8363 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8364 * to swap any object at all.
8365 *
8366 * If 'usethreaded' is true, Redis will try to swap the object in background
8367 * using I/O threads. */
8368static int vmSwapOneObject(int usethreads) {
4ef8de8a 8369 int j, i;
8370 struct dictEntry *best = NULL;
8371 double best_swappability = 0;
b9bc0eef 8372 redisDb *best_db = NULL;
4ef8de8a 8373 robj *key, *val;
8374
8375 for (j = 0; j < server.dbnum; j++) {
8376 redisDb *db = server.db+j;
b72f6a4b 8377 /* Why maxtries is set to 100?
8378 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8379 * are swappable objects */
b0d8747d 8380 int maxtries = 100;
4ef8de8a 8381
8382 if (dictSize(db->dict) == 0) continue;
8383 for (i = 0; i < 5; i++) {
8384 dictEntry *de;
8385 double swappability;
8386
e3cadb8a 8387 if (maxtries) maxtries--;
4ef8de8a 8388 de = dictGetRandomKey(db->dict);
8389 key = dictGetEntryKey(de);
8390 val = dictGetEntryVal(de);
1064ef87 8391 /* Only swap objects that are currently in memory.
8392 *
8393 * Also don't swap shared objects if threaded VM is on, as we
8394 * try to ensure that the main thread does not touch the
8395 * object while the I/O thread is using it, but we can't
8396 * control other keys without adding additional mutex. */
8397 if (key->storage != REDIS_VM_MEMORY ||
8398 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8399 if (maxtries) i--; /* don't count this try */
8400 continue;
8401 }
4ef8de8a 8402 swappability = computeObjectSwappability(val);
8403 if (!best || swappability > best_swappability) {
8404 best = de;
8405 best_swappability = swappability;
b9bc0eef 8406 best_db = db;
4ef8de8a 8407 }
8408 }
8409 }
7c775e09 8410 if (best == NULL) return REDIS_ERR;
4ef8de8a 8411 key = dictGetEntryKey(best);
8412 val = dictGetEntryVal(best);
8413
e3cadb8a 8414 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8415 key->ptr, best_swappability);
8416
8417 /* Unshare the key if needed */
8418 if (key->refcount > 1) {
8419 robj *newkey = dupStringObject(key);
8420 decrRefCount(key);
8421 key = dictGetEntryKey(best) = newkey;
8422 }
8423 /* Swap it */
a69a0c9c 8424 if (usethreads) {
b9bc0eef 8425 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8426 return REDIS_OK;
8427 } else {
a69a0c9c 8428 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8429 dictGetEntryVal(best) = NULL;
8430 return REDIS_OK;
8431 } else {
8432 return REDIS_ERR;
8433 }
4ef8de8a 8434 }
8435}
8436
/* Swap one object out synchronously. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8440
/* Swap one object out using the I/O threads. See vmSwapOneObject() for
 * the return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8444
7e69548d 8445/* Return true if it's safe to swap out objects in a given moment.
8446 * Basically we don't want to swap objects out while there is a BGSAVE
8447 * or a BGAEOREWRITE running in backgroud. */
8448static int vmCanSwapOut(void) {
8449 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8450}
8451
1b03836c 8452/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8453 * and was deleted. Otherwise 0 is returned. */
8454static int deleteIfSwapped(redisDb *db, robj *key) {
8455 dictEntry *de;
8456 robj *foundkey;
8457
8458 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8459 foundkey = dictGetEntryKey(de);
8460 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8461 deleteKey(db,key);
8462 return 1;
8463}
8464
996cb5f7 8465/* =================== Virtual Memory - Threaded I/O ======================= */
8466
b9bc0eef 8467static void freeIOJob(iojob *j) {
d5d55fc3 8468 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8469 j->type == REDIS_IOJOB_DO_SWAP ||
8470 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8471 decrRefCount(j->val);
8472 decrRefCount(j->key);
8473 zfree(j);
8474}
8475
996cb5f7 8476/* Every time a thread finished a Job, it writes a byte into the write side
8477 * of an unix pipe in order to "awake" the main thread, and this function
8478 * is called. */
8479static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8480 int mask)
8481{
8482 char buf[1];
b0d8747d 8483 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8484 REDIS_NOTUSED(el);
8485 REDIS_NOTUSED(mask);
8486 REDIS_NOTUSED(privdata);
8487
8488 /* For every byte we read in the read side of the pipe, there is one
8489 * I/O job completed to process. */
8490 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8491 iojob *j;
8492 listNode *ln;
8493 robj *key;
8494 struct dictEntry *de;
8495
996cb5f7 8496 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8497
8498 /* Get the processed element (the oldest one) */
8499 lockThreadedIO();
1064ef87 8500 assert(listLength(server.io_processed) != 0);
f6c0bba8 8501 if (toprocess == -1) {
8502 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8503 if (toprocess <= 0) toprocess = 1;
8504 }
b9bc0eef 8505 ln = listFirst(server.io_processed);
8506 j = ln->value;
8507 listDelNode(server.io_processed,ln);
8508 unlockThreadedIO();
8509 /* If this job is marked as canceled, just ignore it */
8510 if (j->canceled) {
8511 freeIOJob(j);
8512 continue;
8513 }
8514 /* Post process it in the main thread, as there are things we
8515 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8516 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8517 de = dictFind(j->db->dict,j->key);
8518 assert(de != NULL);
8519 key = dictGetEntryKey(de);
8520 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8521 redisDb *db;
8522
b9bc0eef 8523 /* Key loaded, bring it at home */
8524 key->storage = REDIS_VM_MEMORY;
8525 key->vm.atime = server.unixtime;
8526 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8527 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8528 (unsigned char*) key->ptr);
8529 server.vm_stats_swapped_objects--;
8530 server.vm_stats_swapins++;
d5d55fc3 8531 dictGetEntryVal(de) = j->val;
8532 incrRefCount(j->val);
8533 db = j->db;
b9bc0eef 8534 freeIOJob(j);
d5d55fc3 8535 /* Handle clients waiting for this key to be loaded. */
8536 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8537 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8538 /* Now we know the amount of pages required to swap this object.
8539 * Let's find some space for it, and queue this task again
8540 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8541 if (!vmCanSwapOut() ||
8542 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8543 {
8544 /* Ooops... no space or we can't swap as there is
8545 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8546 freeIOJob(j);
054e426d 8547 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8548 } else {
c7df85a4 8549 /* Note that we need to mark this pages as used now,
8550 * if the job will be canceled, we'll mark them as freed
8551 * again. */
8552 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8553 j->type = REDIS_IOJOB_DO_SWAP;
8554 lockThreadedIO();
8555 queueIOJob(j);
8556 unlockThreadedIO();
8557 }
8558 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8559 robj *val;
8560
8561 /* Key swapped. We can finally free some memory. */
6c96ba7d 8562 if (key->storage != REDIS_VM_SWAPPING) {
8563 printf("key->storage: %d\n",key->storage);
8564 printf("key->name: %s\n",(char*)key->ptr);
8565 printf("key->refcount: %d\n",key->refcount);
8566 printf("val: %p\n",(void*)j->val);
8567 printf("val->type: %d\n",j->val->type);
8568 printf("val->ptr: %s\n",(char*)j->val->ptr);
8569 }
8570 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8571 val = dictGetEntryVal(de);
8572 key->vm.page = j->page;
8573 key->vm.usedpages = j->pages;
8574 key->storage = REDIS_VM_SWAPPED;
8575 key->vtype = j->val->type;
8576 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8577 dictGetEntryVal(de) = NULL;
b9bc0eef 8578 redisLog(REDIS_DEBUG,
8579 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8580 (unsigned char*) key->ptr,
8581 (unsigned long long) j->page, (unsigned long long) j->pages);
8582 server.vm_stats_swapped_objects++;
8583 server.vm_stats_swapouts++;
8584 freeIOJob(j);
f11b8647 8585 /* Put a few more swap requests in queue if we are still
8586 * out of memory */
b0d8747d 8587 if (trytoswap && vmCanSwapOut() &&
8588 zmalloc_used_memory() > server.vm_max_memory)
8589 {
f11b8647 8590 int more = 1;
8591 while(more) {
8592 lockThreadedIO();
8593 more = listLength(server.io_newjobs) <
8594 (unsigned) server.vm_max_threads;
8595 unlockThreadedIO();
8596 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8597 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8598 trytoswap = 0;
8599 break;
8600 }
f11b8647 8601 }
8602 }
b9bc0eef 8603 }
c953f24b 8604 processed++;
f6c0bba8 8605 if (processed == toprocess) return;
996cb5f7 8606 }
8607 if (retval < 0 && errno != EAGAIN) {
8608 redisLog(REDIS_WARNING,
8609 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8610 strerror(errno));
8611 }
8612}
8613
8614static void lockThreadedIO(void) {
8615 pthread_mutex_lock(&server.io_mutex);
8616}
8617
8618static void unlockThreadedIO(void) {
8619 pthread_mutex_unlock(&server.io_mutex);
8620}
8621
8622/* Remove the specified object from the threaded I/O queue if still not
8623 * processed, otherwise make sure to flag it as canceled. */
8624static void vmCancelThreadedIOJob(robj *o) {
8625 list *lists[3] = {
6c96ba7d 8626 server.io_newjobs, /* 0 */
8627 server.io_processing, /* 1 */
8628 server.io_processed /* 2 */
996cb5f7 8629 };
8630 int i;
8631
8632 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 8633again:
996cb5f7 8634 lockThreadedIO();
8635 /* Search for a matching key in one of the queues */
8636 for (i = 0; i < 3; i++) {
8637 listNode *ln;
c7df85a4 8638 listIter li;
996cb5f7 8639
c7df85a4 8640 listRewind(lists[i],&li);
8641 while ((ln = listNext(&li)) != NULL) {
996cb5f7 8642 iojob *job = ln->value;
8643
6c96ba7d 8644 if (job->canceled) continue; /* Skip this, already canceled. */
996cb5f7 8645 if (compareStringObjects(job->key,o) == 0) {
970e10bb 8646 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8647 (void*)job, (char*)o->ptr, job->type, i);
427a2153 8648 /* Mark the pages as free since the swap didn't happened
8649 * or happened but is now discarded. */
970e10bb 8650 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 8651 vmMarkPagesFree(job->page,job->pages);
8652 /* Cancel the job. It depends on the list the job is
8653 * living in. */
996cb5f7 8654 switch(i) {
8655 case 0: /* io_newjobs */
6c96ba7d 8656 /* If the job was yet not processed the best thing to do
996cb5f7 8657 * is to remove it from the queue at all */
6c96ba7d 8658 freeIOJob(job);
996cb5f7 8659 listDelNode(lists[i],ln);
8660 break;
8661 case 1: /* io_processing */
d5d55fc3 8662 /* Oh Shi- the thread is messing with the Job:
8663 *
8664 * Probably it's accessing the object if this is a
8665 * PREPARE_SWAP or DO_SWAP job.
8666 * If it's a LOAD job it may be reading from disk and
8667 * if we don't wait for the job to terminate before to
8668 * cancel it, maybe in a few microseconds data can be
8669 * corrupted in this pages. So the short story is:
8670 *
8671 * Better to wait for the job to move into the
8672 * next queue (processed)... */
8673
8674 /* We try again and again until the job is completed. */
8675 unlockThreadedIO();
8676 /* But let's wait some time for the I/O thread
8677 * to finish with this job. After all this condition
8678 * should be very rare. */
8679 usleep(1);
8680 goto again;
996cb5f7 8681 case 2: /* io_processed */
2e111efe 8682 /* The job was already processed, that's easy...
8683 * just mark it as canceled so that we'll ignore it
8684 * when processing completed jobs. */
996cb5f7 8685 job->canceled = 1;
8686 break;
8687 }
c7df85a4 8688 /* Finally we have to adjust the storage type of the object
8689 * in order to "UNDO" the operaiton. */
996cb5f7 8690 if (o->storage == REDIS_VM_LOADING)
8691 o->storage = REDIS_VM_SWAPPED;
8692 else if (o->storage == REDIS_VM_SWAPPING)
8693 o->storage = REDIS_VM_MEMORY;
8694 unlockThreadedIO();
8695 return;
8696 }
8697 }
8698 }
8699 unlockThreadedIO();
8700 assert(1 != 1); /* We should never reach this */
8701}
8702
b9bc0eef 8703static void *IOThreadEntryPoint(void *arg) {
8704 iojob *j;
8705 listNode *ln;
8706 REDIS_NOTUSED(arg);
8707
8708 pthread_detach(pthread_self());
8709 while(1) {
8710 /* Get a new job to process */
8711 lockThreadedIO();
8712 if (listLength(server.io_newjobs) == 0) {
8713 /* No new jobs in queue, exit. */
9ebed7cf 8714 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8715 (long) pthread_self());
b9bc0eef 8716 server.io_active_threads--;
8717 unlockThreadedIO();
8718 return NULL;
8719 }
8720 ln = listFirst(server.io_newjobs);
8721 j = ln->value;
8722 listDelNode(server.io_newjobs,ln);
8723 /* Add the job in the processing queue */
8724 j->thread = pthread_self();
8725 listAddNodeTail(server.io_processing,j);
8726 ln = listLast(server.io_processing); /* We use ln later to remove it */
8727 unlockThreadedIO();
9ebed7cf 8728 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8729 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 8730
8731 /* Process the Job */
8732 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8733 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 8734 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8735 FILE *fp = fopen("/dev/null","w+");
8736 j->pages = rdbSavedObjectPages(j->val,fp);
8737 fclose(fp);
8738 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 8739 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8740 j->canceled = 1;
b9bc0eef 8741 }
8742
8743 /* Done: insert the job into the processed queue */
9ebed7cf 8744 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8745 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 8746 lockThreadedIO();
8747 listDelNode(server.io_processing,ln);
8748 listAddNodeTail(server.io_processed,j);
8749 unlockThreadedIO();
8750
8751 /* Signal the main thread there is new stuff to process */
8752 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8753 }
8754 return NULL; /* never reached */
8755}
8756
8757static void spawnIOThread(void) {
8758 pthread_t thread;
478c2c6f 8759 sigset_t mask, omask;
b9bc0eef 8760
478c2c6f 8761 sigemptyset(&mask);
8762 sigaddset(&mask,SIGCHLD);
8763 sigaddset(&mask,SIGHUP);
8764 sigaddset(&mask,SIGPIPE);
8765 pthread_sigmask(SIG_SETMASK, &mask, &omask);
bcaa7a4f 8766 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
478c2c6f 8767 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 8768 server.io_active_threads++;
8769}
8770
4ee9488d 8771/* We need to wait for the last thread to exit before we are able to
8772 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 8773static void waitEmptyIOJobsQueue(void) {
4ee9488d 8774 while(1) {
76b7233a 8775 int io_processed_len;
8776
4ee9488d 8777 lockThreadedIO();
054e426d 8778 if (listLength(server.io_newjobs) == 0 &&
8779 listLength(server.io_processing) == 0 &&
8780 server.io_active_threads == 0)
8781 {
4ee9488d 8782 unlockThreadedIO();
8783 return;
8784 }
76b7233a 8785 /* While waiting for empty jobs queue condition we post-process some
8786 * finshed job, as I/O threads may be hanging trying to write against
8787 * the io_ready_pipe_write FD but there are so much pending jobs that
8788 * it's blocking. */
8789 io_processed_len = listLength(server.io_processed);
4ee9488d 8790 unlockThreadedIO();
76b7233a 8791 if (io_processed_len) {
8792 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8793 usleep(1000); /* 1 millisecond */
8794 } else {
8795 usleep(10000); /* 10 milliseconds */
8796 }
4ee9488d 8797 }
8798}
8799
054e426d 8800static void vmReopenSwapFile(void) {
478c2c6f 8801 /* Note: we don't close the old one as we are in the child process
8802 * and don't want to mess at all with the original file object. */
054e426d 8803 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8804 if (server.vm_fp == NULL) {
8805 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8806 server.vm_swap_file);
478c2c6f 8807 _exit(1);
054e426d 8808 }
8809 server.vm_fd = fileno(server.vm_fp);
8810}
8811
b9bc0eef 8812/* This function must be called while with threaded IO locked */
8813static void queueIOJob(iojob *j) {
6c96ba7d 8814 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8815 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 8816 listAddNodeTail(server.io_newjobs,j);
8817 if (server.io_active_threads < server.vm_max_threads)
8818 spawnIOThread();
8819}
8820
8821static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8822 iojob *j;
8823
8824 assert(key->storage == REDIS_VM_MEMORY);
8825 assert(key->refcount == 1);
8826
8827 j = zmalloc(sizeof(*j));
8828 j->type = REDIS_IOJOB_PREPARE_SWAP;
8829 j->db = db;
8830 j->key = dupStringObject(key);
8831 j->val = val;
8832 incrRefCount(val);
8833 j->canceled = 0;
8834 j->thread = (pthread_t) -1;
f11b8647 8835 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 8836
8837 lockThreadedIO();
8838 queueIOJob(j);
8839 unlockThreadedIO();
8840 return REDIS_OK;
8841}
8842
b0d8747d 8843/* ============ Virtual Memory - Blocking clients on missing keys =========== */
8844
d5d55fc3 8845/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8846 * If there is not already a job loading the key, it is craeted.
8847 * The key is added to the io_keys list in the client structure, and also
8848 * in the hash table mapping swapped keys to waiting clients, that is,
8849 * server.io_waited_keys. */
8850static int waitForSwappedKey(redisClient *c, robj *key) {
8851 struct dictEntry *de;
8852 robj *o;
8853 list *l;
8854
8855 /* If the key does not exist or is already in RAM we don't need to
8856 * block the client at all. */
8857 de = dictFind(c->db->dict,key);
8858 if (de == NULL) return 0;
8859 o = dictGetEntryKey(de);
8860 if (o->storage == REDIS_VM_MEMORY) {
8861 return 0;
8862 } else if (o->storage == REDIS_VM_SWAPPING) {
8863 /* We were swapping the key, undo it! */
8864 vmCancelThreadedIOJob(o);
8865 return 0;
8866 }
8867
8868 /* OK: the key is either swapped, or being loaded just now. */
8869
8870 /* Add the key to the list of keys this client is waiting for.
8871 * This maps clients to keys they are waiting for. */
8872 listAddNodeTail(c->io_keys,key);
8873 incrRefCount(key);
8874
8875 /* Add the client to the swapped keys => clients waiting map. */
8876 de = dictFind(c->db->io_keys,key);
8877 if (de == NULL) {
8878 int retval;
8879
8880 /* For every key we take a list of clients blocked for it */
8881 l = listCreate();
8882 retval = dictAdd(c->db->io_keys,key,l);
8883 incrRefCount(key);
8884 assert(retval == DICT_OK);
8885 } else {
8886 l = dictGetEntryVal(de);
8887 }
8888 listAddNodeTail(l,c);
8889
8890 /* Are we already loading the key from disk? If not create a job */
8891 if (o->storage == REDIS_VM_SWAPPED) {
8892 iojob *j;
8893
8894 o->storage = REDIS_VM_LOADING;
8895 j = zmalloc(sizeof(*j));
8896 j->type = REDIS_IOJOB_LOAD;
8897 j->db = c->db;
8898 j->key = dupStringObject(key);
8899 j->key->vtype = o->vtype;
8900 j->page = o->vm.page;
8901 j->val = NULL;
8902 j->canceled = 0;
8903 j->thread = (pthread_t) -1;
8904 lockThreadedIO();
8905 queueIOJob(j);
8906 unlockThreadedIO();
8907 }
8908 return 1;
8909}
8910
b0d8747d 8911/* Is this client attempting to run a command against swapped keys?
d5d55fc3 8912 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 8913 *
d5d55fc3 8914 * The important idea about this function is that it can fail! If keys will
8915 * still be swapped when the client is resumed, this key lookups will
8916 * just block loading keys from disk. In practical terms this should only
8917 * happen with SORT BY command or if there is a bug in this function.
8918 *
8919 * Return 1 if the client is marked as blocked, 0 if the client can
8920 * continue as the keys it is going to access appear to be in memory. */
8921static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 8922 int j, last;
8923
8924 if (cmd->vm_firstkey == 0) return 0;
8925 last = cmd->vm_lastkey;
8926 if (last < 0) last = c->argc+last;
8927 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8928 waitForSwappedKey(c,c->argv[j]);
d5d55fc3 8929 /* If the client was blocked for at least one key, mark it as blocked. */
8930 if (listLength(c->io_keys)) {
8931 c->flags |= REDIS_IO_WAIT;
8932 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8933 server.vm_blocked_clients++;
8934 return 1;
8935 } else {
8936 return 0;
8937 }
8938}
8939
8940/* Remove the 'key' from the list of blocked keys for a given client.
8941 *
8942 * The function returns 1 when there are no longer blocking keys after
8943 * the current one was removed (and the client can be unblocked). */
8944static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8945 list *l;
8946 listNode *ln;
8947 listIter li;
8948 struct dictEntry *de;
8949
8950 /* Remove the key from the list of keys this client is waiting for. */
8951 listRewind(c->io_keys,&li);
8952 while ((ln = listNext(&li)) != NULL) {
8953 if (compareStringObjects(ln->value,key) == 0) {
8954 listDelNode(c->io_keys,ln);
8955 break;
8956 }
8957 }
8958 assert(ln != NULL);
8959
8960 /* Remove the client form the key => waiting clients map. */
8961 de = dictFind(c->db->io_keys,key);
8962 assert(de != NULL);
8963 l = dictGetEntryVal(de);
8964 ln = listSearchKey(l,c);
8965 assert(ln != NULL);
8966 listDelNode(l,ln);
8967 if (listLength(l) == 0)
8968 dictDelete(c->db->io_keys,key);
8969
8970 return listLength(c->io_keys) == 0;
8971}
8972
8973static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8974 struct dictEntry *de;
8975 list *l;
8976 listNode *ln;
8977 int len;
8978
8979 de = dictFind(db->io_keys,key);
8980 if (!de) return;
8981
8982 l = dictGetEntryVal(de);
8983 len = listLength(l);
8984 /* Note: we can't use something like while(listLength(l)) as the list
8985 * can be freed by the calling function when we remove the last element. */
8986 while (len--) {
8987 ln = listFirst(l);
8988 redisClient *c = ln->value;
8989
8990 if (dontWaitForSwappedKey(c,key)) {
8991 /* Put the client in the list of clients ready to go as we
8992 * loaded all the keys about it. */
8993 listAddNodeTail(server.io_ready_clients,c);
8994 }
8995 }
b0d8747d 8996}
b0d8747d 8997
7f957c92 8998/* ================================= Debugging ============================== */
8999
9000static void debugCommand(redisClient *c) {
9001 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9002 *((char*)-1) = 'x';
210e29f7 9003 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9004 if (rdbSave(server.dbfilename) != REDIS_OK) {
9005 addReply(c,shared.err);
9006 return;
9007 }
9008 emptyDb();
9009 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9010 addReply(c,shared.err);
9011 return;
9012 }
9013 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9014 addReply(c,shared.ok);
71c2b467 9015 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9016 emptyDb();
9017 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9018 addReply(c,shared.err);
9019 return;
9020 }
9021 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9022 addReply(c,shared.ok);
333298da 9023 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9024 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9025 robj *key, *val;
9026
9027 if (!de) {
9028 addReply(c,shared.nokeyerr);
9029 return;
9030 }
9031 key = dictGetEntryKey(de);
9032 val = dictGetEntryVal(de);
59146ef3 9033 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9034 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9035 char *strenc;
9036 char buf[128];
9037
9038 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9039 strenc = strencoding[val->encoding];
9040 } else {
9041 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9042 strenc = buf;
9043 }
ace06542 9044 addReplySds(c,sdscatprintf(sdsempty(),
9045 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9046 "encoding:%s serializedlength:%lld\r\n",
682ac724 9047 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9048 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9049 } else {
9050 addReplySds(c,sdscatprintf(sdsempty(),
9051 "+Key at:%p refcount:%d, value swapped at: page %llu "
9052 "using %llu pages\r\n",
9053 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9054 (unsigned long long) key->vm.usedpages));
9055 }
7d30035d 9056 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9057 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9058 robj *key, *val;
9059
9060 if (!server.vm_enabled) {
9061 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9062 return;
9063 }
9064 if (!de) {
9065 addReply(c,shared.nokeyerr);
9066 return;
9067 }
9068 key = dictGetEntryKey(de);
9069 val = dictGetEntryVal(de);
4ef8de8a 9070 /* If the key is shared we want to create a copy */
9071 if (key->refcount > 1) {
9072 robj *newkey = dupStringObject(key);
9073 decrRefCount(key);
9074 key = dictGetEntryKey(de) = newkey;
9075 }
9076 /* Swap it */
7d30035d 9077 if (key->storage != REDIS_VM_MEMORY) {
9078 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9079 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9080 dictGetEntryVal(de) = NULL;
9081 addReply(c,shared.ok);
9082 } else {
9083 addReply(c,shared.err);
9084 }
7f957c92 9085 } else {
333298da 9086 addReplySds(c,sdsnew(
7d30035d 9087 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9088 }
9089}
56906eef 9090
6c96ba7d 9091static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9092 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9093 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9094#ifdef HAVE_BACKTRACE
9095 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9096 *((char*)-1) = 'x';
9097#endif
9098}
9099
bcfc686d 9100/* =================================== Main! ================================ */
56906eef 9101
bcfc686d 9102#ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found in the file, or -1 if the file can't be opened,
 * can't be read, or does not start with a number. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* Unlike atoi(), strtol() lets us tell "0" apart from unparsable
     * content, which must not be mistaken for an overcommit value of 0. */
    value = strtol(buf,&end,10);
    if (end == buf) return -1;
    return (int)value;
}
9116
9117void linuxOvercommitMemoryWarning(void) {
9118 if (linuxOvercommitMemoryValue() == 0) {
9119 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9120 }
9121}
9122#endif /* __linux__ */
9123
9124static void daemonize(void) {
9125 int fd;
9126 FILE *fp;
9127
9128 if (fork() != 0) exit(0); /* parent exits */
9129 setsid(); /* create a new session */
9130
9131 /* Every output goes to /dev/null. If Redis is daemonized but
9132 * the 'logfile' is set to 'stdout' in the configuration file
9133 * it will not log at all. */
9134 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9135 dup2(fd, STDIN_FILENO);
9136 dup2(fd, STDOUT_FILENO);
9137 dup2(fd, STDERR_FILENO);
9138 if (fd > STDERR_FILENO) close(fd);
9139 }
9140 /* Try to write the pid file */
9141 fp = fopen(server.pidfile,"w");
9142 if (fp) {
9143 fprintf(fp,"%d\n",getpid());
9144 fclose(fp);
56906eef 9145 }
56906eef 9146}
9147
bcfc686d 9148int main(int argc, char **argv) {
9651a787 9149 time_t start;
9150
bcfc686d 9151 initServerConfig();
9152 if (argc == 2) {
9153 resetServerSaveParams();
9154 loadServerConfig(argv[1]);
9155 } else if (argc > 2) {
9156 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
9157 exit(1);
9158 } else {
9159 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9160 }
bcfc686d 9161 if (server.daemonize) daemonize();
71c54b21 9162 initServer();
bcfc686d 9163 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9164#ifdef __linux__
9165 linuxOvercommitMemoryWarning();
9166#endif
9651a787 9167 start = time(NULL);
bcfc686d 9168 if (server.appendonly) {
9169 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9170 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9171 } else {
9172 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9173 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9174 }
bcfc686d 9175 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9176 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9177 aeMain(server.el);
9178 aeDeleteEventLoop(server.el);
9179 return 0;
9180}
9181
9182/* ============================= Backtrace support ========================= */
9183
9184#ifdef HAVE_BACKTRACE
9185static char *findFuncName(void *pointer, unsigned long *offset);
9186
56906eef 9187static void *getMcontextEip(ucontext_t *uc) {
9188#if defined(__FreeBSD__)
9189 return (void*) uc->uc_mcontext.mc_eip;
9190#elif defined(__dietlibc__)
9191 return (void*) uc->uc_mcontext.eip;
06db1f50 9192#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 9193 #if __x86_64__
9194 return (void*) uc->uc_mcontext->__ss.__rip;
9195 #else
56906eef 9196 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 9197 #endif
06db1f50 9198#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 9199 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 9200 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 9201 #else
9202 return (void*) uc->uc_mcontext->__ss.__eip;
9203 #endif
54bac49d 9204#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 9205 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 9206#elif defined(__ia64__) /* Linux IA64 */
9207 return (void*) uc->uc_mcontext.sc_ip;
9208#else
9209 return NULL;
56906eef 9210#endif
9211}
9212
9213static void segvHandler(int sig, siginfo_t *info, void *secret) {
9214 void *trace[100];
9215 char **messages = NULL;
9216 int i, trace_size = 0;
9217 unsigned long offset=0;
56906eef 9218 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9219 sds infostring;
56906eef 9220 REDIS_NOTUSED(info);
9221
9222 redisLog(REDIS_WARNING,
9223 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9224 infostring = genRedisInfoString();
9225 redisLog(REDIS_WARNING, "%s",infostring);
9226 /* It's not safe to sdsfree() the returned string under memory
9227 * corruption conditions. Let it leak as we are going to abort */
56906eef 9228
9229 trace_size = backtrace(trace, 100);
de96dbfe 9230 /* overwrite sigaction with caller's address */
b91cf5ef 9231 if (getMcontextEip(uc) != NULL) {
9232 trace[1] = getMcontextEip(uc);
9233 }
56906eef 9234 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9235
d76412d1 9236 for (i=1; i<trace_size; ++i) {
56906eef 9237 char *fn = findFuncName(trace[i], &offset), *p;
9238
9239 p = strchr(messages[i],'+');
9240 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9241 redisLog(REDIS_WARNING,"%s", messages[i]);
9242 } else {
9243 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9244 }
9245 }
b177fd30 9246 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 9247 _exit(0);
fe3bbfbe 9248}
56906eef 9249
9250static void setupSigSegvAction(void) {
9251 struct sigaction act;
9252
9253 sigemptyset (&act.sa_mask);
9254 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9255 * is used. Otherwise, sa_handler is used */
9256 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9257 act.sa_sigaction = segvHandler;
9258 sigaction (SIGSEGV, &act, NULL);
9259 sigaction (SIGBUS, &act, NULL);
12fea928 9260 sigaction (SIGFPE, &act, NULL);
9261 sigaction (SIGILL, &act, NULL);
9262 sigaction (SIGBUS, &act, NULL);
e65fdc78 9263 return;
56906eef 9264}
e65fdc78 9265
bcfc686d 9266#include "staticsymbols.h"
9267/* This function try to convert a pointer into a function name. It's used in
9268 * oreder to provide a backtrace under segmentation fault that's able to
9269 * display functions declared as static (otherwise the backtrace is useless). */
9270static char *findFuncName(void *pointer, unsigned long *offset){
9271 int i, ret = -1;
9272 unsigned long off, minoff = 0;
ed9b544e 9273
bcfc686d 9274 /* Try to match against the Symbol with the smallest offset */
9275 for (i=0; symsTable[i].pointer; i++) {
9276 unsigned long lp = (unsigned long) pointer;
0bc03378 9277
bcfc686d 9278 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9279 off=lp-symsTable[i].pointer;
9280 if (ret < 0 || off < minoff) {
9281 minoff=off;
9282 ret=i;
9283 }
9284 }
0bc03378 9285 }
bcfc686d 9286 if (ret == -1) return NULL;
9287 *offset = minoff;
9288 return symsTable[ret].name;
0bc03378 9289}
bcfc686d 9290#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: there is no handler
 * to install, so this is a no-op. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
bcfc686d 9293#endif /* HAVE_BACKTRACE */
0bc03378 9294
ed9b544e 9295
ed9b544e 9296
bcfc686d 9297/* The End */
9298
9299
ed9b544e 9300