]> git.saurik.com Git - redis.git/blame - redis.c
implemented strategy that doesn't use free blocks in zipmaps
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
a9c723ea 30#define REDIS_VERSION "1.3.7"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
248ea310 89#define REDIS_STATIC_ARGS 8
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
1763929f 94#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
109/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
ed9b544e 114
115/* Object types */
116#define REDIS_STRING 0
117#define REDIS_LIST 1
118#define REDIS_SET 2
1812e024 119#define REDIS_ZSET 3
120#define REDIS_HASH 4
f78fd11b 121
5234952b 122/* Objects encoding. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
942a3961 125#define REDIS_ENCODING_RAW 0 /* Raw representation */
126#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 127#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 129
/* Human readable names of the object encodings. The array is indexed by the
 * REDIS_ENCODING_* value, so the order of the strings must follow the order
 * of those defines (0 = raw, 1 = int, 2 = zipmap, 3 = hashtable). */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
133
f78fd11b 134/* Object types only used for dumping to disk */
bb32ede5 135#define REDIS_EXPIRETIME 253
ed9b544e 136#define REDIS_SELECTDB 254
137#define REDIS_EOF 255
138
f78fd11b 139/* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specifies the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
f78fd11b 149 *
10c43610 150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
f78fd11b 152#define REDIS_RDB_6BITLEN 0
153#define REDIS_RDB_14BITLEN 1
154#define REDIS_RDB_32BITLEN 2
17be1a4a 155#define REDIS_RDB_ENCVAL 3
f78fd11b 156#define REDIS_RDB_LENERR UINT_MAX
157
a4d1ba9a 158/* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 164#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 165
75680a3c 166/* Virtual memory object->where field. */
167#define REDIS_VM_MEMORY 0 /* The object is on memory */
168#define REDIS_VM_SWAPPED 1 /* The object is on disk */
169#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
06224fec 172/* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174#define REDIS_VM_MAX_NEAR_PAGES 65536
175#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 176#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 177#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 178/* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
c953f24b 182#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 183
ed9b544e 184/* Client flags */
d5d55fc3 185#define REDIS_SLAVE 1 /* This client is a slave server */
186#define REDIS_MASTER 2 /* This client is a master server */
187#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188#define REDIS_MULTI 8 /* This client is in a MULTI context */
189#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 191
40d224a9 192/* Slave replication state - slave side */
ed9b544e 193#define REDIS_REPL_NONE 0 /* No active replication */
194#define REDIS_REPL_CONNECT 1 /* Must connect to master */
195#define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
40d224a9 197/* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
ed9b544e 206/* List related stuff */
207#define REDIS_HEAD 0
208#define REDIS_TAIL 1
209
210/* Sort operations */
211#define REDIS_SORT_GET 0
443c6409 212#define REDIS_SORT_ASC 1
213#define REDIS_SORT_DESC 2
ed9b544e 214#define REDIS_SORTKEY_MAX 1024
215
216/* Log levels */
217#define REDIS_DEBUG 0
f870935d 218#define REDIS_VERBOSE 1
219#define REDIS_NOTICE 2
220#define REDIS_WARNING 3
ed9b544e 221
222/* Anti-warning macro... */
223#define REDIS_NOTUSED(V) ((void) V)
224
6b47e12e 225#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 227
48f0308a 228/* Append only defines */
229#define APPENDFSYNC_NO 0
230#define APPENDFSYNC_ALWAYS 1
231#define APPENDFSYNC_EVERYSEC 2
232
cbba7dd7 233/* Hashes related defaults */
234#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* We can print the stacktrace, so our assert is defined this way:
 * when the expression '_e' is false, _redisAssert() is called with the
 * stringified expression, the source file and the line number, and then the
 * process terminates via _exit(1). When '_e' is true the macro evaluates to
 * a void expression and nothing else happens. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 240
ed9b544e 241/*================================= Data types ============================== */
242
243/* A redis object, that is a type able to hold a string / list / set */
75680a3c 244
/* The VM object structure: swap-related bookkeeping embedded in every
 * Redis object.
 *
 * NOTE(review): the trailing declarator also defines a file-scope object
 * named 'vm' of this type. It looks unintentional; confirm nothing refers
 * to it before removing. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
251
252/* The actual Redis Object */
ed9b544e 253typedef struct redisObject {
ed9b544e 254 void *ptr;
942a3961 255 unsigned char type;
256 unsigned char encoding;
d894161b 257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
ed9b544e 261 int refcount;
75680a3c 262 /* VM fields, this are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
ed9b544e 267} robj;
268
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * '_var' is the object itself (not a pointer to it), '_ptr' the value stored
 * in its 'ptr' field. The 'storage' field is only written when
 * server.vm_enabled is true.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be used
 * as the body of an unbraced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
3305306f 281typedef struct redisDb {
4409877e 282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 285 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 286 int id;
287} redisDb;
288
6e469882 289/* Client MULTI/EXEC state */
290typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294} multiCmd;
295
296typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299} multiState;
300
ed9b544e 301/* With multiplexing we need to take per-clinet state.
302 * Clients are taken in a liked list. */
303typedef struct redisClient {
304 int fd;
3305306f 305 redisDb *db;
ed9b544e 306 int dictid;
307 sds querybuf;
e8a74421 308 robj **argv, **mbargv;
309 int argc, mbargc;
40d224a9 310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 311 int multibulk; /* multi bulk command format active */
ed9b544e 312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
6e469882 320 long repldboff; /* replication DB file offset */
40d224a9 321 off_t repldbsize; /* replication DB file size */
6e469882 322 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 323 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 324 * operation such as BLPOP. Otherwise NULL. */
b177fd30 325 int blockingkeysnum; /* Number of blocking keys */
4409877e 326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
92f8e882 328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
ed9b544e 330} redisClient;
331
/* A pair of a time span in seconds and a number of changes. The server keeps
 * an array of these in its 'saveparams' field. */
struct saveparam {
    time_t seconds;
    int changes;
};
336
337/* Global server state structure */
338struct redisServer {
339 int port;
340 int fd;
3305306f 341 redisDb *db;
4409877e 342 dict *sharingpool; /* Poll used for object sharing */
10c43610 343 unsigned int sharingpoolsize;
ed9b544e 344 long long dirty; /* changes to DB from the last save */
345 list *clients;
87eca727 346 list *slaves, *monitors;
ed9b544e 347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
2a6a2ed1 356 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 357 /* Configuration */
358 int verbosity;
359 int glueoutputbuf;
360 int maxidletime;
361 int dbnum;
362 int daemonize;
44b38ef4 363 int appendonly;
48f0308a 364 int appendfsync;
365 time_t lastfsync;
44b38ef4 366 int appendfd;
367 int appendseldb;
ed329fcf 368 char *pidfile;
9f3c422c 369 pid_t bgsavechildpid;
9d65a1bb 370 pid_t bgrewritechildpid;
371 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
ed9b544e 372 struct saveparam *saveparams;
373 int saveparamslen;
374 char *logfile;
375 char *bindaddr;
376 char *dbfilename;
44b38ef4 377 char *appendfilename;
abcb223e 378 char *requirepass;
10c43610 379 int shareobjects;
121f70cf 380 int rdbcompression;
ed9b544e 381 /* Replication related */
382 int isslave;
d0ccebcf 383 char *masterauth;
ed9b544e 384 char *masterhost;
385 int masterport;
40d224a9 386 redisClient *master; /* client that is master for this slave */
ed9b544e 387 int replstate;
285add55 388 unsigned int maxclients;
4ef8de8a 389 unsigned long long maxmemory;
d5d55fc3 390 unsigned int blpop_blocked_clients;
391 unsigned int vm_blocked_clients;
ed9b544e 392 /* Sort parameters - qsort_r() is only available under BSD so we
393 * have to take this state global, in order to pass it to sortCompare() */
394 int sort_desc;
395 int sort_alpha;
396 int sort_bypattern;
75680a3c 397 /* Virtual memory configuration */
398 int vm_enabled;
054e426d 399 char *vm_swap_file;
75680a3c 400 off_t vm_page_size;
401 off_t vm_pages;
4ef8de8a 402 unsigned long long vm_max_memory;
cbba7dd7 403 /* Hashes config */
404 size_t hash_max_zipmap_entries;
405 size_t hash_max_zipmap_value;
75680a3c 406 /* Virtual memory state */
407 FILE *vm_fp;
408 int vm_fd;
409 off_t vm_next_page; /* Next probably empty page */
410 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 411 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 412 time_t unixtime; /* Unix time sampled every second. */
92f8e882 413 /* Virtual memory I/O threads stuff */
92f8e882 414 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 415 * put the result of the operation in the io_done list. While the
416 * job is being processed, it's put on io_processing queue. */
417 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
418 list *io_processing; /* List of VM I/O jobs being processed */
419 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 420 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 421 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 422 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
423 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 424 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 425 int io_active_threads; /* Number of running I/O threads */
426 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 427 /* Our main thread is blocked on the event loop, locking for sockets ready
428 * to be read or written, so when a threaded I/O operation is ready to be
429 * processed by the main thread, the I/O thread will use a unix pipe to
430 * awake the main thread. The followings are the two pipe FDs. */
431 int io_ready_pipe_read;
432 int io_ready_pipe_write;
7d98e08c 433 /* Virtual memory stats */
434 unsigned long long vm_stats_used_pages;
435 unsigned long long vm_stats_swapped_objects;
436 unsigned long long vm_stats_swapouts;
437 unsigned long long vm_stats_swapins;
b9bc0eef 438 FILE *devnull;
ed9b544e 439};
440
441typedef void redisCommandProc(redisClient *c);
442struct redisCommand {
443 char *name;
444 redisCommandProc *proc;
445 int arity;
446 int flags;
76583ea4
PN
447 /* Use a function to determine which keys need to be loaded
448 * in the background prior to executing this command. Takes precedence
449 * over vm_firstkey and others, ignored when NULL */
450 redisCommandProc *vm_preload_proc;
7c775e09 451 /* What keys should be loaded in background when calling this command? */
452 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
453 int vm_lastkey; /* THe last argument that's a key */
454 int vm_keystep; /* The step between first and last key */
ed9b544e 455};
456
/* Associates a function name with a pointer value stored as an
 * unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
461
ed9b544e 462typedef struct _redisSortObject {
463 robj *obj;
464 union {
465 double score;
466 robj *cmpobj;
467 } u;
468} redisSortObject;
469
470typedef struct _redisSortOperation {
471 int type;
472 robj *pattern;
473} redisSortOperation;
474
6b47e12e 475/* ZSETs use a specialized version of Skiplists */
476
477typedef struct zskiplistNode {
478 struct zskiplistNode **forward;
e3870fab 479 struct zskiplistNode *backward;
912b9165 480 unsigned int *span;
6b47e12e 481 double score;
482 robj *obj;
483} zskiplistNode;
484
485typedef struct zskiplist {
e3870fab 486 struct zskiplistNode *header, *tail;
d13f767c 487 unsigned long length;
6b47e12e 488 int level;
489} zskiplist;
490
1812e024 491typedef struct zset {
492 dict *dict;
6b47e12e 493 zskiplist *zsl;
1812e024 494} zset;
495
6b47e12e 496/* Our shared "common" objects */
497
ed9b544e 498struct sharedObjectsStruct {
c937aa89 499 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 500 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 501 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
502 *outofrangeerr, *plus,
ed9b544e 503 *select0, *select1, *select2, *select3, *select4,
504 *select5, *select6, *select7, *select8, *select9;
505} shared;
506
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
512
92f8e882 513/* VM threaded I/O request message */
b9bc0eef 514#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
515#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
516#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 517typedef struct iojob {
996cb5f7 518 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 519 redisDb *db;/* Redis database */
92f8e882 520 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 521 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 522 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
523 off_t page; /* Swap page where to read/write the object */
248ea310 524 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 525 int canceled; /* True if this command was canceled by blocking side of VM */
526 pthread_t thread; /* ID of the thread processing this entry */
527} iojob;
92f8e882 528
ed9b544e 529/*================================ Prototypes =============================== */
530
531static void freeStringObject(robj *o);
532static void freeListObject(robj *o);
533static void freeSetObject(robj *o);
534static void decrRefCount(void *o);
535static robj *createObject(int type, void *ptr);
536static void freeClient(redisClient *c);
f78fd11b 537static int rdbLoad(char *filename);
ed9b544e 538static void addReply(redisClient *c, robj *obj);
539static void addReplySds(redisClient *c, sds s);
540static void incrRefCount(robj *o);
f78fd11b 541static int rdbSaveBackground(char *filename);
ed9b544e 542static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 543static robj *dupStringObject(robj *o);
248ea310 544static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 545static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 546static int syncWithMaster(void);
10c43610 547static robj *tryObjectSharing(robj *o);
942a3961 548static int tryObjectEncoding(robj *o);
9d65a1bb 549static robj *getDecodedObject(robj *o);
3305306f 550static int removeExpire(redisDb *db, robj *key);
551static int expireIfNeeded(redisDb *db, robj *key);
552static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 553static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 554static int deleteKey(redisDb *db, robj *key);
bb32ede5 555static time_t getExpire(redisDb *db, robj *key);
556static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 557static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 558static void freeMemoryIfNeeded(void);
de96dbfe 559static int processCommand(redisClient *c);
56906eef 560static void setupSigSegvAction(void);
a3b21203 561static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 562static void aofRemoveTempFile(pid_t childpid);
0ea663ea 563static size_t stringObjectLen(robj *o);
638e42ac 564static void processInputBuffer(redisClient *c);
6b47e12e 565static zskiplist *zslCreate(void);
fd8ccf44 566static void zslFree(zskiplist *zsl);
2b59cfdf 567static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 568static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 569static void initClientMultiState(redisClient *c);
570static void freeClientMultiState(redisClient *c);
571static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 572static void unblockClientWaitingData(redisClient *c);
4409877e 573static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 574static void vmInit(void);
a35ddf12 575static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 576static robj *vmLoadObject(robj *key);
7e69548d 577static robj *vmPreviewObject(robj *key);
a69a0c9c 578static int vmSwapOneObjectBlocking(void);
579static int vmSwapOneObjectThreaded(void);
7e69548d 580static int vmCanSwapOut(void);
a5819310 581static int tryFreeOneObjectFromFreelist(void);
996cb5f7 582static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
583static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
584static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 585static void lockThreadedIO(void);
586static void unlockThreadedIO(void);
587static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
588static void freeIOJob(iojob *j);
589static void queueIOJob(iojob *j);
a5819310 590static int vmWriteObjectOnSwap(robj *o, off_t page);
591static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 592static void waitEmptyIOJobsQueue(void);
593static void vmReopenSwapFile(void);
970e10bb 594static int vmFreePage(off_t page);
76583ea4 595static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 596static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
597static int dontWaitForSwappedKey(redisClient *c, robj *key);
598static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
599static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
600static struct redisCommand *lookupCommand(char *name);
601static void call(redisClient *c, struct redisCommand *cmd);
602static void resetClient(redisClient *c);
ada386b2 603static void convertToRealHash(robj *o);
ed9b544e 604
abcb223e 605static void authCommand(redisClient *c);
ed9b544e 606static void pingCommand(redisClient *c);
607static void echoCommand(redisClient *c);
608static void setCommand(redisClient *c);
609static void setnxCommand(redisClient *c);
610static void getCommand(redisClient *c);
611static void delCommand(redisClient *c);
612static void existsCommand(redisClient *c);
613static void incrCommand(redisClient *c);
614static void decrCommand(redisClient *c);
615static void incrbyCommand(redisClient *c);
616static void decrbyCommand(redisClient *c);
617static void selectCommand(redisClient *c);
618static void randomkeyCommand(redisClient *c);
619static void keysCommand(redisClient *c);
620static void dbsizeCommand(redisClient *c);
621static void lastsaveCommand(redisClient *c);
622static void saveCommand(redisClient *c);
623static void bgsaveCommand(redisClient *c);
9d65a1bb 624static void bgrewriteaofCommand(redisClient *c);
ed9b544e 625static void shutdownCommand(redisClient *c);
626static void moveCommand(redisClient *c);
627static void renameCommand(redisClient *c);
628static void renamenxCommand(redisClient *c);
629static void lpushCommand(redisClient *c);
630static void rpushCommand(redisClient *c);
631static void lpopCommand(redisClient *c);
632static void rpopCommand(redisClient *c);
633static void llenCommand(redisClient *c);
634static void lindexCommand(redisClient *c);
635static void lrangeCommand(redisClient *c);
636static void ltrimCommand(redisClient *c);
637static void typeCommand(redisClient *c);
638static void lsetCommand(redisClient *c);
639static void saddCommand(redisClient *c);
640static void sremCommand(redisClient *c);
a4460ef4 641static void smoveCommand(redisClient *c);
ed9b544e 642static void sismemberCommand(redisClient *c);
643static void scardCommand(redisClient *c);
12fea928 644static void spopCommand(redisClient *c);
2abb95a9 645static void srandmemberCommand(redisClient *c);
ed9b544e 646static void sinterCommand(redisClient *c);
647static void sinterstoreCommand(redisClient *c);
40d224a9 648static void sunionCommand(redisClient *c);
649static void sunionstoreCommand(redisClient *c);
f4f56e1d 650static void sdiffCommand(redisClient *c);
651static void sdiffstoreCommand(redisClient *c);
ed9b544e 652static void syncCommand(redisClient *c);
653static void flushdbCommand(redisClient *c);
654static void flushallCommand(redisClient *c);
655static void sortCommand(redisClient *c);
656static void lremCommand(redisClient *c);
0f5f7e9a 657static void rpoplpushcommand(redisClient *c);
ed9b544e 658static void infoCommand(redisClient *c);
70003d28 659static void mgetCommand(redisClient *c);
87eca727 660static void monitorCommand(redisClient *c);
3305306f 661static void expireCommand(redisClient *c);
802e8373 662static void expireatCommand(redisClient *c);
f6b141c5 663static void getsetCommand(redisClient *c);
fd88489a 664static void ttlCommand(redisClient *c);
321b0e13 665static void slaveofCommand(redisClient *c);
7f957c92 666static void debugCommand(redisClient *c);
f6b141c5 667static void msetCommand(redisClient *c);
668static void msetnxCommand(redisClient *c);
fd8ccf44 669static void zaddCommand(redisClient *c);
7db723ad 670static void zincrbyCommand(redisClient *c);
cc812361 671static void zrangeCommand(redisClient *c);
50c55df5 672static void zrangebyscoreCommand(redisClient *c);
f44dd428 673static void zcountCommand(redisClient *c);
e3870fab 674static void zrevrangeCommand(redisClient *c);
3c41331e 675static void zcardCommand(redisClient *c);
1b7106e7 676static void zremCommand(redisClient *c);
6e333bbe 677static void zscoreCommand(redisClient *c);
1807985b 678static void zremrangebyscoreCommand(redisClient *c);
6e469882 679static void multiCommand(redisClient *c);
680static void execCommand(redisClient *c);
18b6cb76 681static void discardCommand(redisClient *c);
4409877e 682static void blpopCommand(redisClient *c);
683static void brpopCommand(redisClient *c);
4b00bebd 684static void appendCommand(redisClient *c);
39191553 685static void substrCommand(redisClient *c);
69d95c3e 686static void zrankCommand(redisClient *c);
798d9e55 687static void zrevrankCommand(redisClient *c);
978c2c94 688static void hsetCommand(redisClient *c);
689static void hgetCommand(redisClient *c);
07efaf74 690static void hdelCommand(redisClient *c);
92b27fe9 691static void hlenCommand(redisClient *c);
9212eafd 692static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
693static void zunionCommand(redisClient *c);
694static void zinterCommand(redisClient *c);
78409a0f 695static void hkeysCommand(redisClient *c);
696static void hvalsCommand(redisClient *c);
697static void hgetallCommand(redisClient *c);
a86f14b1 698static void hexistsCommand(redisClient *c);
500ece7c 699static void configCommand(redisClient *c);
01426b05 700static void hincrbyCommand(redisClient *c);
f6b141c5 701
ed9b544e 702/*================================= Globals ================================= */
703
704/* Global vars */
705static struct redisServer server; /* server global state */
706static struct redisCommand cmdTable[] = {
76583ea4
PN
707 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
708 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
709 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
710 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
711 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
712 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
713 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
714 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
715 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
716 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
717 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
718 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
719 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
720 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
721 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
723 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
725 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
726 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
727 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
728 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
729 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
730 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
731 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
732 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
733 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
734 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
738 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
739 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
740 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
741 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
742 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
743 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
747 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
750 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
751 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
01426b05 760 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
761 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
762 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 767 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
768 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
772 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
773 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
774 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
775 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
779 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
780 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
783 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
785 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
786 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
787 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
788 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
795 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
799 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 803 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
76583ea4 804 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 805};
bcfc686d 806
723fb69b
AO
807static void usage();
808
ed9b544e 809/*============================ Utility functions ============================ */
810
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character out of the set; [^abc] negates the set,
 *            [a-z] is a range and a backslash escapes the next set member
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * buffer needs to be NUL terminated because no byte beyond the given length
 * is ever read. When nocase is non-zero literal characters and ranges are
 * compared case-insensitively (escaped set members are still compared
 * exactly).
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*'. pattern[1] is only valid while at
             * least two pattern bytes are left. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one string character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for values that fit in
                         * an unsigned char: leave negative chars alone. */
                        if (start >= 0) start = tolower(start);
                        if (end >= 0) end = tolower(end);
                        if (c >= 0) c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a string character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
933
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern, patternLen, string, stringLen, nocase);
}
937
56906eef 938static void redisLog(int level, const char *fmt, ...) {
ed9b544e 939 va_list ap;
940 FILE *fp;
941
942 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
943 if (!fp) return;
944
945 va_start(ap, fmt);
946 if (level >= server.verbosity) {
6766f45e 947 char *c = ".-*#";
1904ecc1 948 char buf[64];
949 time_t now;
950
951 now = time(NULL);
6c9385e0 952 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 953 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 954 vfprintf(fp, fmt, ap);
955 fprintf(fp,"\n");
956 fflush(fp);
957 }
958 va_end(ap);
959
960 if (server.logfile) fclose(fp);
961}
962
963/*====================== Hash table type implementation ==================== */
964
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
968
/* Dict destructor callback that releases 'val' with zfree(). The private
 * data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
974
4409877e 975static void dictListDestructor(void *privdata, void *val)
976{
977 DICT_NOTUSED(privdata);
978 listRelease((list*)val);
979}
980
ed9b544e 981static int sdsDictKeyCompare(void *privdata, const void *key1,
982 const void *key2)
983{
984 int l1,l2;
985 DICT_NOTUSED(privdata);
986
987 l1 = sdslen((sds)key1);
988 l2 = sdslen((sds)key2);
989 if (l1 != l2) return 0;
990 return memcmp(key1, key2, l1) == 0;
991}
992
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. NULL is tolerated because the values of swapped out keys are set
 * to NULL. The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1000
942a3961 1001static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1002 const void *key2)
1003{
1004 const robj *o1 = key1, *o2 = key2;
1005 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1006}
1007
942a3961 1008static unsigned int dictObjHash(const void *key) {
ed9b544e 1009 const robj *o = key;
1010 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1011}
1012
942a3961 1013static int dictEncObjKeyCompare(void *privdata, const void *key1,
1014 const void *key2)
1015{
9d65a1bb 1016 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1017 int cmp;
942a3961 1018
2a1198b4 1019 if (o1->encoding == REDIS_ENCODING_INT &&
1020 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1021 o1->ptr == o2->ptr) return 1;
2a1198b4 1022
9d65a1bb 1023 o1 = getDecodedObject(o1);
1024 o2 = getDecodedObject(o2);
1025 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1026 decrRefCount(o1);
1027 decrRefCount(o2);
1028 return cmp;
942a3961 1029}
1030
1031static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1032 robj *o = (robj*) key;
942a3961 1033
ed9e4966 1034 if (o->encoding == REDIS_ENCODING_RAW) {
1035 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1036 } else {
1037 if (o->encoding == REDIS_ENCODING_INT) {
1038 char buf[32];
1039 int len;
1040
1041 len = snprintf(buf,32,"%ld",(long)o->ptr);
1042 return dictGenHashFunction((unsigned char*)buf, len);
1043 } else {
1044 unsigned int hash;
1045
1046 o = getDecodedObject(o);
1047 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1048 decrRefCount(o);
1049 return hash;
1050 }
1051 }
942a3961 1052}
1053
f2d9f50f 1054/* Sets type and expires */
ed9b544e 1055static dictType setDictType = {
942a3961 1056 dictEncObjHash, /* hash function */
ed9b544e 1057 NULL, /* key dup */
1058 NULL, /* val dup */
942a3961 1059 dictEncObjKeyCompare, /* key compare */
ed9b544e 1060 dictRedisObjectDestructor, /* key destructor */
1061 NULL /* val destructor */
1062};
1063
f2d9f50f 1064/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1065static dictType zsetDictType = {
1066 dictEncObjHash, /* hash function */
1067 NULL, /* key dup */
1068 NULL, /* val dup */
1069 dictEncObjKeyCompare, /* key compare */
1070 dictRedisObjectDestructor, /* key destructor */
da0a1620 1071 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1072};
1073
f2d9f50f 1074/* Db->dict */
5234952b 1075static dictType dbDictType = {
942a3961 1076 dictObjHash, /* hash function */
ed9b544e 1077 NULL, /* key dup */
1078 NULL, /* val dup */
942a3961 1079 dictObjKeyCompare, /* key compare */
ed9b544e 1080 dictRedisObjectDestructor, /* key destructor */
1081 dictRedisObjectDestructor /* val destructor */
1082};
1083
f2d9f50f 1084/* Db->expires */
1085static dictType keyptrDictType = {
1086 dictObjHash, /* hash function */
1087 NULL, /* key dup */
1088 NULL, /* val dup */
1089 dictObjKeyCompare, /* key compare */
1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092};
1093
5234952b 1094/* Hash type hash table (note that small hashes are represented with zimpaps) */
1095static dictType hashDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
1101 dictRedisObjectDestructor /* val destructor */
1102};
1103
4409877e 1104/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1105 * lists as values. It's used for blocking operations (BLPOP) and to
1106 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1107static dictType keylistDictType = {
1108 dictObjHash, /* hash function */
1109 NULL, /* key dup */
1110 NULL, /* val dup */
1111 dictObjKeyCompare, /* key compare */
1112 dictRedisObjectDestructor, /* key destructor */
1113 dictListDestructor /* val destructor */
1114};
1115
42ab0172
AO
1116static void version();
1117
ed9b544e 1118/* ========================= Random utility functions ======================= */
1119
1120/* Redis generally does not try to recover from out of memory conditions
1121 * when allocating objects or strings, it is not clear if it will be possible
1122 * to report this condition to the client since the networking layer itself
1123 * is based on heap allocation for send buffers, so we simply abort.
1124 * At least the code will be simpler to read... */
1125static void oom(const char *msg) {
71c54b21 1126 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1127 sleep(1);
1128 abort();
1129}
1130
1131/* ====================== Redis server networking stuff ===================== */
56906eef 1132static void closeTimedoutClients(void) {
ed9b544e 1133 redisClient *c;
ed9b544e 1134 listNode *ln;
1135 time_t now = time(NULL);
c7df85a4 1136 listIter li;
ed9b544e 1137
c7df85a4 1138 listRewind(server.clients,&li);
1139 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1140 c = listNodeValue(ln);
f86a74e9 1141 if (server.maxidletime &&
1142 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1143 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1144 (now - c->lastinteraction > server.maxidletime))
1145 {
f870935d 1146 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1147 freeClient(c);
f86a74e9 1148 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1149 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1150 addReply(c,shared.nullmultibulk);
b0d8747d 1151 unblockClientWaitingData(c);
f86a74e9 1152 }
ed9b544e 1153 }
1154 }
ed9b544e 1155}
1156
12fea928 1157static int htNeedsResize(dict *dict) {
1158 long long size, used;
1159
1160 size = dictSlots(dict);
1161 used = dictSize(dict);
1162 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1163 (used*100/size < REDIS_HT_MINFILL));
1164}
1165
0bc03378 1166/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1167 * we resize the hash table to save memory */
56906eef 1168static void tryResizeHashTables(void) {
0bc03378 1169 int j;
1170
1171 for (j = 0; j < server.dbnum; j++) {
12fea928 1172 if (htNeedsResize(server.db[j].dict)) {
f870935d 1173 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1174 dictResize(server.db[j].dict);
f870935d 1175 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1176 }
12fea928 1177 if (htNeedsResize(server.db[j].expires))
1178 dictResize(server.db[j].expires);
0bc03378 1179 }
1180}
1181
9d65a1bb 1182/* A background saving child (BGSAVE) terminated its work. Handle this. */
1183void backgroundSaveDoneHandler(int statloc) {
1184 int exitcode = WEXITSTATUS(statloc);
1185 int bysignal = WIFSIGNALED(statloc);
1186
1187 if (!bysignal && exitcode == 0) {
1188 redisLog(REDIS_NOTICE,
1189 "Background saving terminated with success");
1190 server.dirty = 0;
1191 server.lastsave = time(NULL);
1192 } else if (!bysignal && exitcode != 0) {
1193 redisLog(REDIS_WARNING, "Background saving error");
1194 } else {
1195 redisLog(REDIS_WARNING,
1196 "Background saving terminated by signal");
1197 rdbRemoveTempFile(server.bgsavechildpid);
1198 }
1199 server.bgsavechildpid = -1;
1200 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1201 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1202 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1203}
1204
1205/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1206 * Handle this. */
1207void backgroundRewriteDoneHandler(int statloc) {
1208 int exitcode = WEXITSTATUS(statloc);
1209 int bysignal = WIFSIGNALED(statloc);
1210
1211 if (!bysignal && exitcode == 0) {
1212 int fd;
1213 char tmpfile[256];
1214
1215 redisLog(REDIS_NOTICE,
1216 "Background append only file rewriting terminated with success");
1217 /* Now it's time to flush the differences accumulated by the parent */
1218 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1219 fd = open(tmpfile,O_WRONLY|O_APPEND);
1220 if (fd == -1) {
1221 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1222 goto cleanup;
1223 }
1224 /* Flush our data... */
1225 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1226 (signed) sdslen(server.bgrewritebuf)) {
1227 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1228 close(fd);
1229 goto cleanup;
1230 }
b32627cd 1231 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1232 /* Now our work is to rename the temp file into the stable file. And
1233 * switch the file descriptor used by the server for append only. */
1234 if (rename(tmpfile,server.appendfilename) == -1) {
1235 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1236 close(fd);
1237 goto cleanup;
1238 }
1239 /* Mission completed... almost */
1240 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1241 if (server.appendfd != -1) {
1242 /* If append only is actually enabled... */
1243 close(server.appendfd);
1244 server.appendfd = fd;
1245 fsync(fd);
85a83172 1246 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1247 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1248 } else {
1249 /* If append only is disabled we just generate a dump in this
1250 * format. Why not? */
1251 close(fd);
1252 }
1253 } else if (!bysignal && exitcode != 0) {
1254 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1255 } else {
1256 redisLog(REDIS_WARNING,
1257 "Background append only file rewriting terminated by signal");
1258 }
1259cleanup:
1260 sdsfree(server.bgrewritebuf);
1261 server.bgrewritebuf = sdsempty();
1262 aofRemoveTempFile(server.bgrewritechildpid);
1263 server.bgrewritechildpid = -1;
1264}
1265
56906eef 1266static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1267 int j, loops = server.cronloops++;
ed9b544e 1268 REDIS_NOTUSED(eventLoop);
1269 REDIS_NOTUSED(id);
1270 REDIS_NOTUSED(clientData);
1271
3a66edc7 1272 /* We take a cached value of the unix time in the global state because
1273 * with virtual memory and aging there is to store the current time
1274 * in objects at every object access, and accuracy is not needed.
1275 * To access a global var is faster than calling time(NULL) */
1276 server.unixtime = time(NULL);
1277
0bc03378 1278 /* Show some info about non-empty databases */
ed9b544e 1279 for (j = 0; j < server.dbnum; j++) {
dec423d9 1280 long long size, used, vkeys;
94754ccc 1281
3305306f 1282 size = dictSlots(server.db[j].dict);
1283 used = dictSize(server.db[j].dict);
94754ccc 1284 vkeys = dictSize(server.db[j].expires);
1763929f 1285 if (!(loops % 50) && (used || vkeys)) {
f870935d 1286 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1287 /* dictPrintStats(server.dict); */
ed9b544e 1288 }
ed9b544e 1289 }
1290
0bc03378 1291 /* We don't want to resize the hash tables while a bacground saving
1292 * is in progress: the saving child is created using fork() that is
1293 * implemented with a copy-on-write semantic in most modern systems, so
1294 * if we resize the HT while there is the saving child at work actually
1295 * a lot of memory movements in the parent will cause a lot of pages
1296 * copied. */
1763929f 1297 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
0bc03378 1298
ed9b544e 1299 /* Show information about connected clients */
1763929f 1300 if (!(loops % 50)) {
f870935d 1301 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1302 listLength(server.clients)-listLength(server.slaves),
1303 listLength(server.slaves),
b72f6a4b 1304 zmalloc_used_memory(),
3305306f 1305 dictSize(server.sharingpool));
ed9b544e 1306 }
1307
1308 /* Close connections of timedout clients */
1763929f 1309 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1310 closeTimedoutClients();
1311
9d65a1bb 1312 /* Check if a background saving or AOF rewrite in progress terminated */
1313 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1314 int statloc;
9d65a1bb 1315 pid_t pid;
1316
1317 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1318 if (pid == server.bgsavechildpid) {
1319 backgroundSaveDoneHandler(statloc);
ed9b544e 1320 } else {
9d65a1bb 1321 backgroundRewriteDoneHandler(statloc);
ed9b544e 1322 }
ed9b544e 1323 }
1324 } else {
1325 /* If there is not a background saving in progress check if
1326 * we have to save now */
1327 time_t now = time(NULL);
1328 for (j = 0; j < server.saveparamslen; j++) {
1329 struct saveparam *sp = server.saveparams+j;
1330
1331 if (server.dirty >= sp->changes &&
1332 now-server.lastsave > sp->seconds) {
1333 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1334 sp->changes, sp->seconds);
f78fd11b 1335 rdbSaveBackground(server.dbfilename);
ed9b544e 1336 break;
1337 }
1338 }
1339 }
94754ccc 1340
f2324293 1341 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1342 * will use few CPU cycles if there are few expiring keys, otherwise
1343 * it will get more aggressive to avoid that too much memory is used by
1344 * keys that can be removed from the keyspace. */
94754ccc 1345 for (j = 0; j < server.dbnum; j++) {
f2324293 1346 int expired;
94754ccc 1347 redisDb *db = server.db+j;
94754ccc 1348
f2324293 1349 /* Continue to expire if at the end of the cycle more than 25%
1350 * of the keys were expired. */
1351 do {
4ef8de8a 1352 long num = dictSize(db->expires);
94754ccc 1353 time_t now = time(NULL);
1354
f2324293 1355 expired = 0;
94754ccc 1356 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1357 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1358 while (num--) {
1359 dictEntry *de;
1360 time_t t;
1361
1362 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1363 t = (time_t) dictGetEntryVal(de);
1364 if (now > t) {
1365 deleteKey(db,dictGetEntryKey(de));
f2324293 1366 expired++;
2a6a2ed1 1367 server.stat_expiredkeys++;
94754ccc 1368 }
1369 }
f2324293 1370 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1371 }
1372
4ef8de8a 1373 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1374 * is enbled. Try to free objects from the free list first. */
7e69548d 1375 if (vmCanSwapOut()) {
1376 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1377 server.vm_max_memory)
1378 {
72e9fd40 1379 int retval;
1380
a5819310 1381 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1382 retval = (server.vm_max_threads == 0) ?
1383 vmSwapOneObjectBlocking() :
1384 vmSwapOneObjectThreaded();
1763929f 1385 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1386 zmalloc_used_memory() >
1387 (server.vm_max_memory+server.vm_max_memory/10))
1388 {
1389 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1390 }
72e9fd40 1391 /* Note that when using threade I/O we free just one object,
1392 * because anyway when the I/O thread in charge to swap this
1393 * object out will finish, the handler of completed jobs
1394 * will try to swap more objects if we are still out of memory. */
1395 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1396 }
1397 }
1398
ed9b544e 1399 /* Check if we should connect to a MASTER */
1763929f 1400 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1401 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1402 if (syncWithMaster() == REDIS_OK) {
1403 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1404 }
1405 }
1763929f 1406 return 100;
ed9b544e 1407}
1408
d5d55fc3 1409/* This function gets called every time Redis is entering the
1410 * main loop of the event driven library, that is, before to sleep
1411 * for ready file descriptors. */
1412static void beforeSleep(struct aeEventLoop *eventLoop) {
1413 REDIS_NOTUSED(eventLoop);
1414
1415 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1416 listIter li;
1417 listNode *ln;
1418
1419 listRewind(server.io_ready_clients,&li);
1420 while((ln = listNext(&li))) {
1421 redisClient *c = ln->value;
1422 struct redisCommand *cmd;
1423
1424 /* Resume the client. */
1425 listDelNode(server.io_ready_clients,ln);
1426 c->flags &= (~REDIS_IO_WAIT);
1427 server.vm_blocked_clients--;
1428 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1429 readQueryFromClient, c);
1430 cmd = lookupCommand(c->argv[0]->ptr);
1431 assert(cmd != NULL);
1432 call(c,cmd);
1433 resetClient(c);
1434 /* There may be more data to process in the input buffer. */
1435 if (c->querybuf && sdslen(c->querybuf) > 0)
1436 processInputBuffer(c);
1437 }
1438 }
1439}
1440
ed9b544e 1441static void createSharedObjects(void) {
1442 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1443 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1444 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1445 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1446 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1447 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1448 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1449 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1450 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1451 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1452 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1453 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1454 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1455 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1456 "-ERR no such key\r\n"));
ed9b544e 1457 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1458 "-ERR syntax error\r\n"));
c937aa89 1459 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1460 "-ERR source and destination objects are the same\r\n"));
1461 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1462 "-ERR index out of range\r\n"));
ed9b544e 1463 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1464 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1465 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1466 shared.select0 = createStringObject("select 0\r\n",10);
1467 shared.select1 = createStringObject("select 1\r\n",10);
1468 shared.select2 = createStringObject("select 2\r\n",10);
1469 shared.select3 = createStringObject("select 3\r\n",10);
1470 shared.select4 = createStringObject("select 4\r\n",10);
1471 shared.select5 = createStringObject("select 5\r\n",10);
1472 shared.select6 = createStringObject("select 6\r\n",10);
1473 shared.select7 = createStringObject("select 7\r\n",10);
1474 shared.select8 = createStringObject("select 8\r\n",10);
1475 shared.select9 = createStringObject("select 9\r\n",10);
1476}
1477
1478static void appendServerSaveParams(time_t seconds, int changes) {
1479 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1480 server.saveparams[server.saveparamslen].seconds = seconds;
1481 server.saveparams[server.saveparamslen].changes = changes;
1482 server.saveparamslen++;
1483}
1484
bcfc686d 1485static void resetServerSaveParams() {
ed9b544e 1486 zfree(server.saveparams);
1487 server.saveparams = NULL;
1488 server.saveparamslen = 0;
1489}
1490
1491static void initServerConfig() {
1492 server.dbnum = REDIS_DEFAULT_DBNUM;
1493 server.port = REDIS_SERVERPORT;
f870935d 1494 server.verbosity = REDIS_VERBOSE;
ed9b544e 1495 server.maxidletime = REDIS_MAXIDLETIME;
1496 server.saveparams = NULL;
1497 server.logfile = NULL; /* NULL = log on standard output */
1498 server.bindaddr = NULL;
1499 server.glueoutputbuf = 1;
1500 server.daemonize = 0;
44b38ef4 1501 server.appendonly = 0;
4e141d5a 1502 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1503 server.lastfsync = time(NULL);
44b38ef4 1504 server.appendfd = -1;
1505 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1506 server.pidfile = zstrdup("/var/run/redis.pid");
1507 server.dbfilename = zstrdup("dump.rdb");
1508 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1509 server.requirepass = NULL;
10c43610 1510 server.shareobjects = 0;
b0553789 1511 server.rdbcompression = 1;
21aecf4b 1512 server.sharingpoolsize = 1024;
285add55 1513 server.maxclients = 0;
d5d55fc3 1514 server.blpop_blocked_clients = 0;
3fd78bcd 1515 server.maxmemory = 0;
75680a3c 1516 server.vm_enabled = 0;
054e426d 1517 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1518 server.vm_page_size = 256; /* 256 bytes per page */
1519 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1520 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1521 server.vm_max_threads = 4;
d5d55fc3 1522 server.vm_blocked_clients = 0;
cbba7dd7 1523 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1524 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1525
bcfc686d 1526 resetServerSaveParams();
ed9b544e 1527
1528 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1529 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1530 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1531 /* Replication related */
1532 server.isslave = 0;
d0ccebcf 1533 server.masterauth = NULL;
ed9b544e 1534 server.masterhost = NULL;
1535 server.masterport = 6379;
1536 server.master = NULL;
1537 server.replstate = REDIS_REPL_NONE;
a7866db6 1538
1539 /* Double constants initialization */
1540 R_Zero = 0.0;
1541 R_PosInf = 1.0/R_Zero;
1542 R_NegInf = -1.0/R_Zero;
1543 R_Nan = R_Zero/R_Zero;
ed9b544e 1544}
1545
1546static void initServer() {
1547 int j;
1548
1549 signal(SIGHUP, SIG_IGN);
1550 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1551 setupSigSegvAction();
ed9b544e 1552
b9bc0eef 1553 server.devnull = fopen("/dev/null","w");
1554 if (server.devnull == NULL) {
1555 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1556 exit(1);
1557 }
ed9b544e 1558 server.clients = listCreate();
1559 server.slaves = listCreate();
87eca727 1560 server.monitors = listCreate();
ed9b544e 1561 server.objfreelist = listCreate();
1562 createSharedObjects();
1563 server.el = aeCreateEventLoop();
3305306f 1564 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1565 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1566 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1567 if (server.fd == -1) {
1568 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1569 exit(1);
1570 }
3305306f 1571 for (j = 0; j < server.dbnum; j++) {
5234952b 1572 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1573 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1574 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1575 if (server.vm_enabled)
1576 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1577 server.db[j].id = j;
1578 }
ed9b544e 1579 server.cronloops = 0;
9f3c422c 1580 server.bgsavechildpid = -1;
9d65a1bb 1581 server.bgrewritechildpid = -1;
1582 server.bgrewritebuf = sdsempty();
ed9b544e 1583 server.lastsave = time(NULL);
1584 server.dirty = 0;
ed9b544e 1585 server.stat_numcommands = 0;
1586 server.stat_numconnections = 0;
2a6a2ed1 1587 server.stat_expiredkeys = 0;
ed9b544e 1588 server.stat_starttime = time(NULL);
3a66edc7 1589 server.unixtime = time(NULL);
d8f8b666 1590 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1591 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1592 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1593
1594 if (server.appendonly) {
71eba477 1595 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1596 if (server.appendfd == -1) {
1597 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1598 strerror(errno));
1599 exit(1);
1600 }
1601 }
75680a3c 1602
1603 if (server.vm_enabled) vmInit();
ed9b544e 1604}
1605
1606/* Empty the whole database */
ca37e9cd 1607static long long emptyDb() {
ed9b544e 1608 int j;
ca37e9cd 1609 long long removed = 0;
ed9b544e 1610
3305306f 1611 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1612 removed += dictSize(server.db[j].dict);
3305306f 1613 dictEmpty(server.db[j].dict);
1614 dictEmpty(server.db[j].expires);
1615 }
ca37e9cd 1616 return removed;
ed9b544e 1617}
1618
/* Translate a "yes"/"no" string (compared ignoring case) into an integer:
 * 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1624
ed9b544e 1625/* I agree, this is a very rudimental way to load a configuration...
1626 will improve later if the config gets more complex */
1627static void loadServerConfig(char *filename) {
c9a111ac 1628 FILE *fp;
ed9b544e 1629 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1630 int linenum = 0;
1631 sds line = NULL;
6bccf64a
AO
1632 char *errormsg = "Fatal error, can't open config file '%s'";
1633 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1634 sprintf(errorbuf, errormsg, filename);
c9a111ac 1635
1636 if (filename[0] == '-' && filename[1] == '\0')
1637 fp = stdin;
1638 else {
1639 if ((fp = fopen(filename,"r")) == NULL) {
6bccf64a 1640 redisLog(REDIS_WARNING, errorbuf);
c9a111ac 1641 exit(1);
1642 }
ed9b544e 1643 }
c9a111ac 1644
ed9b544e 1645 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1646 sds *argv;
1647 int argc, j;
1648
1649 linenum++;
1650 line = sdsnew(buf);
1651 line = sdstrim(line," \t\r\n");
1652
1653 /* Skip comments and blank lines*/
1654 if (line[0] == '#' || line[0] == '\0') {
1655 sdsfree(line);
1656 continue;
1657 }
1658
1659 /* Split into arguments */
1660 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1661 sdstolower(argv[0]);
1662
1663 /* Execute config directives */
bb0b03a3 1664 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1665 server.maxidletime = atoi(argv[1]);
0150db36 1666 if (server.maxidletime < 0) {
ed9b544e 1667 err = "Invalid timeout value"; goto loaderr;
1668 }
bb0b03a3 1669 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1670 server.port = atoi(argv[1]);
1671 if (server.port < 1 || server.port > 65535) {
1672 err = "Invalid port"; goto loaderr;
1673 }
bb0b03a3 1674 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1675 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1676 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1677 int seconds = atoi(argv[1]);
1678 int changes = atoi(argv[2]);
1679 if (seconds < 1 || changes < 0) {
1680 err = "Invalid save parameters"; goto loaderr;
1681 }
1682 appendServerSaveParams(seconds,changes);
bb0b03a3 1683 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1684 if (chdir(argv[1]) == -1) {
1685 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1686 argv[1], strerror(errno));
1687 exit(1);
1688 }
bb0b03a3 1689 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1690 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1691 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1692 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1693 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1694 else {
1695 err = "Invalid log level. Must be one of debug, notice, warning";
1696 goto loaderr;
1697 }
bb0b03a3 1698 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1699 FILE *logfp;
ed9b544e 1700
1701 server.logfile = zstrdup(argv[1]);
bb0b03a3 1702 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1703 zfree(server.logfile);
1704 server.logfile = NULL;
1705 }
1706 if (server.logfile) {
1707 /* Test if we are able to open the file. The server will not
1708 * be able to abort just for this problem later... */
c9a111ac 1709 logfp = fopen(server.logfile,"a");
1710 if (logfp == NULL) {
ed9b544e 1711 err = sdscatprintf(sdsempty(),
1712 "Can't open the log file: %s", strerror(errno));
1713 goto loaderr;
1714 }
c9a111ac 1715 fclose(logfp);
ed9b544e 1716 }
bb0b03a3 1717 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1718 server.dbnum = atoi(argv[1]);
1719 if (server.dbnum < 1) {
1720 err = "Invalid number of databases"; goto loaderr;
1721 }
b3f83f12
JZ
1722 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1723 loadServerConfig(argv[1]);
285add55 1724 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1725 server.maxclients = atoi(argv[1]);
3fd78bcd 1726 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1727 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1728 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1729 server.masterhost = sdsnew(argv[1]);
1730 server.masterport = atoi(argv[2]);
1731 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1732 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1733 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1734 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1735 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1736 err = "argument must be 'yes' or 'no'"; goto loaderr;
1737 }
bb0b03a3 1738 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1739 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1740 err = "argument must be 'yes' or 'no'"; goto loaderr;
1741 }
121f70cf 1742 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1743 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1744 err = "argument must be 'yes' or 'no'"; goto loaderr;
1745 }
e52c65b9 1746 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1747 server.sharingpoolsize = atoi(argv[1]);
1748 if (server.sharingpoolsize < 1) {
1749 err = "invalid object sharing pool size"; goto loaderr;
1750 }
bb0b03a3 1751 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1752 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1753 err = "argument must be 'yes' or 'no'"; goto loaderr;
1754 }
44b38ef4 1755 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1756 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1757 err = "argument must be 'yes' or 'no'"; goto loaderr;
1758 }
48f0308a 1759 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1760 if (!strcasecmp(argv[1],"no")) {
48f0308a 1761 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1762 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1763 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1764 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1765 server.appendfsync = APPENDFSYNC_EVERYSEC;
1766 } else {
1767 err = "argument must be 'no', 'always' or 'everysec'";
1768 goto loaderr;
1769 }
bb0b03a3 1770 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1771 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1772 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1773 zfree(server.pidfile);
054e426d 1774 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1775 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1776 zfree(server.dbfilename);
054e426d 1777 server.dbfilename = zstrdup(argv[1]);
75680a3c 1778 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1779 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1780 err = "argument must be 'yes' or 'no'"; goto loaderr;
1781 }
054e426d 1782 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1783 zfree(server.vm_swap_file);
054e426d 1784 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1785 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1786 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1787 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1788 server.vm_page_size = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1790 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1791 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1792 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1793 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1794 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1795 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1796 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1797 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1798 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1799 } else {
1800 err = "Bad directive or wrong number of arguments"; goto loaderr;
1801 }
1802 for (j = 0; j < argc; j++)
1803 sdsfree(argv[j]);
1804 zfree(argv);
1805 sdsfree(line);
1806 }
c9a111ac 1807 if (fp != stdin) fclose(fp);
ed9b544e 1808 return;
1809
1810loaderr:
1811 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1812 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1813 fprintf(stderr, ">>> '%s'\n", line);
1814 fprintf(stderr, "%s\n", err);
1815 exit(1);
1816}
1817
1818static void freeClientArgv(redisClient *c) {
1819 int j;
1820
1821 for (j = 0; j < c->argc; j++)
1822 decrRefCount(c->argv[j]);
e8a74421 1823 for (j = 0; j < c->mbargc; j++)
1824 decrRefCount(c->mbargv[j]);
ed9b544e 1825 c->argc = 0;
e8a74421 1826 c->mbargc = 0;
ed9b544e 1827}
1828
1829static void freeClient(redisClient *c) {
1830 listNode *ln;
1831
4409877e 1832 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1833 * call, we have to set querybuf to NULL *before* to call
1834 * unblockClientWaitingData() to avoid processInputBuffer() will get
1835 * called. Also it is important to remove the file events after
1836 * this, because this call adds the READABLE event. */
4409877e 1837 sdsfree(c->querybuf);
1838 c->querybuf = NULL;
1839 if (c->flags & REDIS_BLOCKED)
b0d8747d 1840 unblockClientWaitingData(c);
4409877e 1841
ed9b544e 1842 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1843 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1844 listRelease(c->reply);
1845 freeClientArgv(c);
1846 close(c->fd);
92f8e882 1847 /* Remove from the list of clients */
ed9b544e 1848 ln = listSearchKey(server.clients,c);
dfc5e96c 1849 redisAssert(ln != NULL);
ed9b544e 1850 listDelNode(server.clients,ln);
d5d55fc3 1851 /* Remove from the list of clients waiting for swapped keys */
1852 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1853 ln = listSearchKey(server.io_ready_clients,c);
1854 if (ln) {
1855 listDelNode(server.io_ready_clients,ln);
1856 server.vm_blocked_clients--;
1857 }
1858 }
1859 while (server.vm_enabled && listLength(c->io_keys)) {
1860 ln = listFirst(c->io_keys);
1861 dontWaitForSwappedKey(c,ln->value);
92f8e882 1862 }
b3e3d0d7 1863 listRelease(c->io_keys);
92f8e882 1864 /* Other cleanup */
ed9b544e 1865 if (c->flags & REDIS_SLAVE) {
6208b3a7 1866 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1867 close(c->repldbfd);
87eca727 1868 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1869 ln = listSearchKey(l,c);
dfc5e96c 1870 redisAssert(ln != NULL);
87eca727 1871 listDelNode(l,ln);
ed9b544e 1872 }
1873 if (c->flags & REDIS_MASTER) {
1874 server.master = NULL;
1875 server.replstate = REDIS_REPL_CONNECT;
1876 }
93ea3759 1877 zfree(c->argv);
e8a74421 1878 zfree(c->mbargv);
6e469882 1879 freeClientMultiState(c);
ed9b544e 1880 zfree(c);
1881}
1882
cc30e368 1883#define GLUEREPLY_UP_TO (1024)
ed9b544e 1884static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1885 int copylen = 0;
1886 char buf[GLUEREPLY_UP_TO];
6208b3a7 1887 listNode *ln;
c7df85a4 1888 listIter li;
ed9b544e 1889 robj *o;
1890
c7df85a4 1891 listRewind(c->reply,&li);
1892 while((ln = listNext(&li))) {
c28b42ac 1893 int objlen;
1894
ed9b544e 1895 o = ln->value;
c28b42ac 1896 objlen = sdslen(o->ptr);
1897 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1898 memcpy(buf+copylen,o->ptr,objlen);
1899 copylen += objlen;
ed9b544e 1900 listDelNode(c->reply,ln);
c28b42ac 1901 } else {
1902 if (copylen == 0) return;
1903 break;
ed9b544e 1904 }
ed9b544e 1905 }
c28b42ac 1906 /* Now the output buffer is empty, add the new single element */
1907 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1908 listAddNodeHead(c->reply,o);
ed9b544e 1909}
1910
1911static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1912 redisClient *c = privdata;
1913 int nwritten = 0, totwritten = 0, objlen;
1914 robj *o;
1915 REDIS_NOTUSED(el);
1916 REDIS_NOTUSED(mask);
1917
2895e862 1918 /* Use writev() if we have enough buffers to send */
7ea870c0 1919 if (!server.glueoutputbuf &&
1920 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1921 !(c->flags & REDIS_MASTER))
2895e862 1922 {
1923 sendReplyToClientWritev(el, fd, privdata, mask);
1924 return;
1925 }
2895e862 1926
ed9b544e 1927 while(listLength(c->reply)) {
c28b42ac 1928 if (server.glueoutputbuf && listLength(c->reply) > 1)
1929 glueReplyBuffersIfNeeded(c);
1930
ed9b544e 1931 o = listNodeValue(listFirst(c->reply));
1932 objlen = sdslen(o->ptr);
1933
1934 if (objlen == 0) {
1935 listDelNode(c->reply,listFirst(c->reply));
1936 continue;
1937 }
1938
1939 if (c->flags & REDIS_MASTER) {
6f376729 1940 /* Don't reply to a master */
ed9b544e 1941 nwritten = objlen - c->sentlen;
1942 } else {
a4d1ba9a 1943 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1944 if (nwritten <= 0) break;
1945 }
1946 c->sentlen += nwritten;
1947 totwritten += nwritten;
1948 /* If we fully sent the object on head go to the next one */
1949 if (c->sentlen == objlen) {
1950 listDelNode(c->reply,listFirst(c->reply));
1951 c->sentlen = 0;
1952 }
6f376729 1953 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1954 * bytes, in a single threaded server it's a good idea to serve
6f376729 1955 * other clients as well, even if a very large request comes from
1956 * super fast link that is always able to accept data (in real world
12f9d551 1957 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1958 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1959 }
1960 if (nwritten == -1) {
1961 if (errno == EAGAIN) {
1962 nwritten = 0;
1963 } else {
f870935d 1964 redisLog(REDIS_VERBOSE,
ed9b544e 1965 "Error writing to client: %s", strerror(errno));
1966 freeClient(c);
1967 return;
1968 }
1969 }
1970 if (totwritten > 0) c->lastinteraction = time(NULL);
1971 if (listLength(c->reply) == 0) {
1972 c->sentlen = 0;
1973 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1974 }
1975}
1976
2895e862 1977static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1978{
1979 redisClient *c = privdata;
1980 int nwritten = 0, totwritten = 0, objlen, willwrite;
1981 robj *o;
1982 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1983 int offset, ion = 0;
1984 REDIS_NOTUSED(el);
1985 REDIS_NOTUSED(mask);
1986
1987 listNode *node;
1988 while (listLength(c->reply)) {
1989 offset = c->sentlen;
1990 ion = 0;
1991 willwrite = 0;
1992
1993 /* fill-in the iov[] array */
1994 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1995 o = listNodeValue(node);
1996 objlen = sdslen(o->ptr);
1997
1998 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1999 break;
2000
2001 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2002 break; /* no more iovecs */
2003
2004 iov[ion].iov_base = ((char*)o->ptr) + offset;
2005 iov[ion].iov_len = objlen - offset;
2006 willwrite += objlen - offset;
2007 offset = 0; /* just for the first item */
2008 ion++;
2009 }
2010
2011 if(willwrite == 0)
2012 break;
2013
2014 /* write all collected blocks at once */
2015 if((nwritten = writev(fd, iov, ion)) < 0) {
2016 if (errno != EAGAIN) {
f870935d 2017 redisLog(REDIS_VERBOSE,
2895e862 2018 "Error writing to client: %s", strerror(errno));
2019 freeClient(c);
2020 return;
2021 }
2022 break;
2023 }
2024
2025 totwritten += nwritten;
2026 offset = c->sentlen;
2027
2028 /* remove written robjs from c->reply */
2029 while (nwritten && listLength(c->reply)) {
2030 o = listNodeValue(listFirst(c->reply));
2031 objlen = sdslen(o->ptr);
2032
2033 if(nwritten >= objlen - offset) {
2034 listDelNode(c->reply, listFirst(c->reply));
2035 nwritten -= objlen - offset;
2036 c->sentlen = 0;
2037 } else {
2038 /* partial write */
2039 c->sentlen += nwritten;
2040 break;
2041 }
2042 offset = 0;
2043 }
2044 }
2045
2046 if (totwritten > 0)
2047 c->lastinteraction = time(NULL);
2048
2049 if (listLength(c->reply) == 0) {
2050 c->sentlen = 0;
2051 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2052 }
2053}
2054
ed9b544e 2055static struct redisCommand *lookupCommand(char *name) {
2056 int j = 0;
2057 while(cmdTable[j].name != NULL) {
bb0b03a3 2058 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2059 j++;
2060 }
2061 return NULL;
2062}
2063
2064/* resetClient prepare the client to process the next command */
2065static void resetClient(redisClient *c) {
2066 freeClientArgv(c);
2067 c->bulklen = -1;
e8a74421 2068 c->multibulk = 0;
ed9b544e 2069}
2070
6e469882 2071/* Call() is the core of Redis execution of a command */
2072static void call(redisClient *c, struct redisCommand *cmd) {
2073 long long dirty;
2074
2075 dirty = server.dirty;
2076 cmd->proc(c);
2077 if (server.appendonly && server.dirty-dirty)
2078 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2079 if (server.dirty-dirty && listLength(server.slaves))
248ea310 2080 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2081 if (listLength(server.monitors))
248ea310 2082 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2083 server.stat_numcommands++;
2084}
2085
ed9b544e 2086/* If this function gets called we already read a whole
2087 * command, arguments are in the client argv/argc fields.
2088 * processCommand() executes the command or prepares the
2089 * server for a bulk read from the client.
2090 *
2091 * If 1 is returned the client is still alive and valid and
2092 * other operations can be performed by the caller. Otherwise
2093 * if 0 is returned the client was destroyed (i.e. after QUIT).
 *
 * Summary of the steps, in order: memory is freed if 'maxmemory' is set;
 * the multi bulk protocol state machine is advanced (returning 1 until the
 * last argument was received); QUIT is handled; the command is looked up
 * and its arity checked; for bulk commands the last argument is taken from
 * the query buffer when already available; arguments are optionally shared
 * and encoded; authentication and 'maxmemory' are enforced; finally the
 * command is either queued (MULTI), deferred (swapped keys) or executed. */
2094static int processCommand(redisClient *c) {
2095 struct redisCommand *cmd;
ed9b544e 2096
3fd78bcd 2097 /* Free some memory if needed (maxmemory setting) */
2098 if (server.maxmemory) freeMemoryIfNeeded();
2099
e8a74421 2100 /* Handle the multi bulk command type. This is an alternative protocol
2101 * supported by Redis in order to receive commands that are composed of
2102 * multiple binary-safe "bulk" arguments. The latency of processing is
2103 * a bit higher but this allows things like multi-sets, so if this
2104 * protocol is used only for MSET and similar commands this is a big win. */
2105 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
/* First line of a multi bulk request: "*<count>". A count <= 0 is silently
 * discarded, otherwise the count is stored and the "*<count>" argument is
 * dropped. */
2106 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2107 if (c->multibulk <= 0) {
2108 resetClient(c);
2109 return 1;
2110 } else {
2111 decrRefCount(c->argv[c->argc-1]);
2112 c->argc--;
2113 return 1;
2114 }
2115 } else if (c->multibulk) {
2116 if (c->bulklen == -1) {
/* Expecting the "$<length>" header of the next multi bulk argument. */
2117 if (((char*)c->argv[0]->ptr)[0] != '$') {
2118 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2119 resetClient(c);
2120 return 1;
2121 } else {
/* NOTE(review): only argv[0] is released and argc is decremented by one.
 * If the header line carried more than one token, argv[0] would still be
 * counted by argc while pointing to a released object; confirm that such
 * input cannot reach this point. */
2122 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2123 decrRefCount(c->argv[0]);
2124 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2125 c->argc--;
2126 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2127 resetClient(c);
2128 return 1;
2129 }
2130 c->argc--;
2131 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2132 return 1;
2133 }
2134 } else {
/* A multi bulk argument was just read into argv[0]: move it to the
 * multi bulk vector and decrement the number of arguments still missing. */
2135 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2136 c->mbargv[c->mbargc] = c->argv[0];
2137 c->mbargc++;
2138 c->argc--;
2139 c->multibulk--;
2140 if (c->multibulk == 0) {
2141 robj **auxargv;
2142 int auxargc;
2143
2144 /* Here we need to swap the multi-bulk argc/argv with the
2145 * normal argc/argv of the client structure. */
2146 auxargv = c->argv;
2147 c->argv = c->mbargv;
2148 c->mbargv = auxargv;
2149
2150 auxargc = c->argc;
2151 c->argc = c->mbargc;
2152 c->mbargc = auxargc;
2153
2154 /* We need to set bulklen to something different than -1
2155 * in order for the code below to process the command without
2156 * to try to read the last argument of a bulk command as
2157 * a special argument. */
2158 c->bulklen = 0;
2159 /* continue below and process the command */
2160 } else {
2161 c->bulklen = -1;
2162 return 1;
2163 }
2164 }
2165 }
2166 /* -- end of multi bulk commands processing -- */
2167
ed9b544e 2168 /* The QUIT command is handled as a special case. Normal command
2169 * procs are unable to close the client connection safely */
bb0b03a3 2170 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2171 freeClient(c);
2172 return 0;
2173 }
d5d55fc3 2174
2175 /* Now lookup the command and check ASAP about trivial error conditions
2176 * such wrong arity, bad command name and so forth. */
ed9b544e 2177 cmd = lookupCommand(c->argv[0]->ptr);
2178 if (!cmd) {
2c14807b 2179 addReplySds(c,
2180 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2181 (char*)c->argv[0]->ptr));
ed9b544e 2182 resetClient(c);
2183 return 1;
/* A positive arity must match argc exactly; a negative arity -N means that
 * at least N arguments are required. */
2184 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2185 (c->argc < -cmd->arity)) {
454d4e43 2186 addReplySds(c,
2187 sdscatprintf(sdsempty(),
2188 "-ERR wrong number of arguments for '%s' command\r\n",
2189 cmd->name));
ed9b544e 2190 resetClient(c);
2191 return 1;
2192 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2193 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2194 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2195
2196 decrRefCount(c->argv[c->argc-1]);
2197 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2198 c->argc--;
2199 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2200 resetClient(c);
2201 return 1;
2202 }
2203 c->argc--;
2204 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2205 /* It is possible that the bulk read is already in the
8d0490e7 2206 * buffer. Check this condition and handle it accordingly.
2207 * This is just a fast path, alternative to call processInputBuffer().
2208 * It's a good idea since the code is small and this condition
2209 * happens most of the times. */
ed9b544e 2210 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2211 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2212 c->argc++;
2213 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2214 } else {
d5d55fc3 2215 /* Otherwise return... there is to read the last argument
2216 * from the socket. */
ed9b544e 2217 return 1;
2218 }
2219 }
10c43610 2220 /* Let's try to share objects on the command arguments vector */
2221 if (server.shareobjects) {
2222 int j;
2223 for(j = 1; j < c->argc; j++)
2224 c->argv[j] = tryObjectSharing(c->argv[j]);
2225 }
942a3961 2226 /* Let's try to encode the bulk object to save space. */
2227 if (cmd->flags & REDIS_CMD_BULK)
2228 tryObjectEncoding(c->argv[c->argc-1]);
2229
e63943a4 2230 /* Check if the user is authenticated */
2231 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2232 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2233 resetClient(c);
2234 return 1;
2235 }
2236
b61a28fe 2237 /* Handle the maxmemory directive */
2238 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2239 zmalloc_used_memory() > server.maxmemory)
2240 {
2241 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2242 resetClient(c);
2243 return 1;
2244 }
2245
ed9b544e 2246 /* Exec the command */
/* Inside a MULTI context every command but EXEC and DISCARD is queued
 * instead of being executed. */
18b6cb76 2247 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2248 queueMultiCommand(c,cmd);
2249 addReply(c,shared.queued);
2250 } else {
/* When blockClientOnSwappedKeys() returns non-zero we return without
 * calling the command and without resetting the client. */
d5d55fc3 2251 if (server.vm_enabled && server.vm_max_threads > 0 &&
2252 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2253 call(c,cmd);
2254 }
ed9b544e 2255
2256 /* Prepare the client for the next command */
ed9b544e 2257 resetClient(c);
2258 return 1;
2259}
2260
248ea310 2261static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2262 listNode *ln;
c7df85a4 2263 listIter li;
ed9b544e 2264 int outc = 0, j;
93ea3759 2265 robj **outv;
248ea310 2266 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2267 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2268 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2269 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2270 robj *lenobj;
93ea3759 2271
2272 if (argc <= REDIS_STATIC_ARGS) {
2273 outv = static_outv;
2274 } else {
248ea310 2275 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2276 }
248ea310 2277
2278 lenobj = createObject(REDIS_STRING,
2279 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2280 lenobj->refcount = 0;
2281 outv[outc++] = lenobj;
ed9b544e 2282 for (j = 0; j < argc; j++) {
248ea310 2283 lenobj = createObject(REDIS_STRING,
2284 sdscatprintf(sdsempty(),"$%lu\r\n",
2285 (unsigned long) stringObjectLen(argv[j])));
2286 lenobj->refcount = 0;
2287 outv[outc++] = lenobj;
ed9b544e 2288 outv[outc++] = argv[j];
248ea310 2289 outv[outc++] = shared.crlf;
ed9b544e 2290 }
ed9b544e 2291
40d224a9 2292 /* Increment all the refcounts at start and decrement at end in order to
2293 * be sure to free objects if there is no slave in a replication state
2294 * able to be feed with commands */
2295 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2296 listRewind(slaves,&li);
2297 while((ln = listNext(&li))) {
ed9b544e 2298 redisClient *slave = ln->value;
40d224a9 2299
2300 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2301 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2302
2303 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2304 if (slave->slaveseldb != dictid) {
2305 robj *selectcmd;
2306
2307 switch(dictid) {
2308 case 0: selectcmd = shared.select0; break;
2309 case 1: selectcmd = shared.select1; break;
2310 case 2: selectcmd = shared.select2; break;
2311 case 3: selectcmd = shared.select3; break;
2312 case 4: selectcmd = shared.select4; break;
2313 case 5: selectcmd = shared.select5; break;
2314 case 6: selectcmd = shared.select6; break;
2315 case 7: selectcmd = shared.select7; break;
2316 case 8: selectcmd = shared.select8; break;
2317 case 9: selectcmd = shared.select9; break;
2318 default:
2319 selectcmd = createObject(REDIS_STRING,
2320 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2321 selectcmd->refcount = 0;
2322 break;
2323 }
2324 addReply(slave,selectcmd);
2325 slave->slaveseldb = dictid;
2326 }
2327 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2328 }
40d224a9 2329 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2330 if (outv != static_outv) zfree(outv);
ed9b544e 2331}
2332
638e42ac 2333static void processInputBuffer(redisClient *c) {
ed9b544e 2334again:
4409877e 2335 /* Before to process the input buffer, make sure the client is not
2336 * waitig for a blocking operation such as BLPOP. Note that the first
2337 * iteration the client is never blocked, otherwise the processInputBuffer
2338 * would not be called at all, but after the execution of the first commands
2339 * in the input buffer the client may be blocked, and the "goto again"
2340 * will try to reiterate. The following line will make it return asap. */
92f8e882 2341 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2342 if (c->bulklen == -1) {
2343 /* Read the first line of the query */
2344 char *p = strchr(c->querybuf,'\n');
2345 size_t querylen;
644fafa3 2346
ed9b544e 2347 if (p) {
2348 sds query, *argv;
2349 int argc, j;
2350
2351 query = c->querybuf;
2352 c->querybuf = sdsempty();
2353 querylen = 1+(p-(query));
2354 if (sdslen(query) > querylen) {
2355 /* leave data after the first line of the query in the buffer */
2356 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2357 }
2358 *p = '\0'; /* remove "\n" */
2359 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2360 sdsupdatelen(query);
2361
2362 /* Now we can split the query in arguments */
ed9b544e 2363 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2364 sdsfree(query);
2365
2366 if (c->argv) zfree(c->argv);
2367 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2368
2369 for (j = 0; j < argc; j++) {
ed9b544e 2370 if (sdslen(argv[j])) {
2371 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2372 c->argc++;
2373 } else {
2374 sdsfree(argv[j]);
2375 }
2376 }
2377 zfree(argv);
7c49733c 2378 if (c->argc) {
2379 /* Execute the command. If the client is still valid
2380 * after processCommand() return and there is something
2381 * on the query buffer try to process the next command. */
2382 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2383 } else {
2384 /* Nothing to process, argc == 0. Just process the query
2385 * buffer if it's not empty or return to the caller */
2386 if (sdslen(c->querybuf)) goto again;
2387 }
ed9b544e 2388 return;
644fafa3 2389 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2390 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2391 freeClient(c);
2392 return;
2393 }
2394 } else {
2395 /* Bulk read handling. Note that if we are at this point
2396 the client already sent a command terminated with a newline,
2397 we are reading the bulk data that is actually the last
2398 argument of the command. */
2399 int qbl = sdslen(c->querybuf);
2400
2401 if (c->bulklen <= qbl) {
2402 /* Copy everything but the final CRLF as final argument */
2403 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2404 c->argc++;
2405 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2406 /* Process the command. If the client is still valid after
2407 * the processing and there is more data in the buffer
2408 * try to parse it. */
2409 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2410 return;
2411 }
2412 }
2413}
2414
638e42ac 2415static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2416 redisClient *c = (redisClient*) privdata;
2417 char buf[REDIS_IOBUF_LEN];
2418 int nread;
2419 REDIS_NOTUSED(el);
2420 REDIS_NOTUSED(mask);
2421
2422 nread = read(fd, buf, REDIS_IOBUF_LEN);
2423 if (nread == -1) {
2424 if (errno == EAGAIN) {
2425 nread = 0;
2426 } else {
f870935d 2427 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2428 freeClient(c);
2429 return;
2430 }
2431 } else if (nread == 0) {
f870935d 2432 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2433 freeClient(c);
2434 return;
2435 }
2436 if (nread) {
2437 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2438 c->lastinteraction = time(NULL);
2439 } else {
2440 return;
2441 }
168ac5c6 2442 processInputBuffer(c);
638e42ac 2443}
2444
ed9b544e 2445static int selectDb(redisClient *c, int id) {
2446 if (id < 0 || id >= server.dbnum)
2447 return REDIS_ERR;
3305306f 2448 c->db = &server.db[id];
ed9b544e 2449 return REDIS_OK;
2450}
2451
40d224a9 2452static void *dupClientReplyValue(void *o) {
2453 incrRefCount((robj*)o);
12d090d2 2454 return o;
40d224a9 2455}
2456
ed9b544e 2457static redisClient *createClient(int fd) {
2458 redisClient *c = zmalloc(sizeof(*c));
2459
2460 anetNonBlock(NULL,fd);
2461 anetTcpNoDelay(NULL,fd);
2462 if (!c) return NULL;
2463 selectDb(c,0);
2464 c->fd = fd;
2465 c->querybuf = sdsempty();
2466 c->argc = 0;
93ea3759 2467 c->argv = NULL;
ed9b544e 2468 c->bulklen = -1;
e8a74421 2469 c->multibulk = 0;
2470 c->mbargc = 0;
2471 c->mbargv = NULL;
ed9b544e 2472 c->sentlen = 0;
2473 c->flags = 0;
2474 c->lastinteraction = time(NULL);
abcb223e 2475 c->authenticated = 0;
40d224a9 2476 c->replstate = REDIS_REPL_NONE;
6b47e12e 2477 c->reply = listCreate();
ed9b544e 2478 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2479 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2480 c->blockingkeys = NULL;
2481 c->blockingkeysnum = 0;
2482 c->io_keys = listCreate();
2483 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2484 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2485 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2486 freeClient(c);
2487 return NULL;
2488 }
6b47e12e 2489 listAddNodeTail(server.clients,c);
6e469882 2490 initClientMultiState(c);
ed9b544e 2491 return c;
2492}
2493
2494static void addReply(redisClient *c, robj *obj) {
2495 if (listLength(c->reply) == 0 &&
6208b3a7 2496 (c->replstate == REDIS_REPL_NONE ||
2497 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2498 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2499 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2500
2501 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2502 obj = dupStringObject(obj);
2503 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2504 }
9d65a1bb 2505 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2506}
2507
2508static void addReplySds(redisClient *c, sds s) {
2509 robj *o = createObject(REDIS_STRING,s);
2510 addReply(c,o);
2511 decrRefCount(o);
2512}
2513
e2665397 2514static void addReplyDouble(redisClient *c, double d) {
2515 char buf[128];
2516
2517 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2518 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2519 (unsigned long) strlen(buf),buf));
e2665397 2520}
2521
f44dd428 2522static void addReplyLong(redisClient *c, long l) {
2523 char buf[128];
2524 size_t len;
2525
dd88747b 2526 if (l == 0) {
2527 addReply(c,shared.czero);
2528 return;
2529 } else if (l == 1) {
2530 addReply(c,shared.cone);
2531 return;
2532 }
f44dd428 2533 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2534 addReplySds(c,sdsnewlen(buf,len));
2535}
2536
92b27fe9 2537static void addReplyUlong(redisClient *c, unsigned long ul) {
2538 char buf[128];
2539 size_t len;
2540
dd88747b 2541 if (ul == 0) {
2542 addReply(c,shared.czero);
2543 return;
2544 } else if (ul == 1) {
2545 addReply(c,shared.cone);
2546 return;
2547 }
92b27fe9 2548 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2549 addReplySds(c,sdsnewlen(buf,len));
2550}
2551
942a3961 2552static void addReplyBulkLen(redisClient *c, robj *obj) {
2553 size_t len;
2554
2555 if (obj->encoding == REDIS_ENCODING_RAW) {
2556 len = sdslen(obj->ptr);
2557 } else {
2558 long n = (long)obj->ptr;
2559
e054afda 2560 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2561 len = 1;
2562 if (n < 0) {
2563 len++;
2564 n = -n;
2565 }
2566 while((n = n/10) != 0) {
2567 len++;
2568 }
2569 }
83c6a618 2570 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2571}
2572
dd88747b 2573static void addReplyBulk(redisClient *c, robj *obj) {
2574 addReplyBulkLen(c,obj);
2575 addReply(c,obj);
2576 addReply(c,shared.crlf);
2577}
2578
500ece7c 2579/* In the CONFIG command we need to add vanilla C string as bulk replies */
2580static void addReplyBulkCString(redisClient *c, char *s) {
2581 if (s == NULL) {
2582 addReply(c,shared.nullbulk);
2583 } else {
2584 robj *o = createStringObject(s,strlen(s));
2585 addReplyBulk(c,o);
2586 decrRefCount(o);
2587 }
2588}
2589
ed9b544e 2590static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2591 int cport, cfd;
2592 char cip[128];
285add55 2593 redisClient *c;
ed9b544e 2594 REDIS_NOTUSED(el);
2595 REDIS_NOTUSED(mask);
2596 REDIS_NOTUSED(privdata);
2597
2598 cfd = anetAccept(server.neterr, fd, cip, &cport);
2599 if (cfd == AE_ERR) {
f870935d 2600 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2601 return;
2602 }
f870935d 2603 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2604 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2605 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2606 close(cfd); /* May be already closed, just ingore errors */
2607 return;
2608 }
285add55 2609 /* If maxclient directive is set and this is one client more... close the
2610 * connection. Note that we create the client instead to check before
2611 * for this condition, since now the socket is already set in nonblocking
2612 * mode and we can send an error for free using the Kernel I/O */
2613 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2614 char *err = "-ERR max number of clients reached\r\n";
2615
2616 /* That's a best effort error message, don't check write errors */
fee803ba 2617 if (write(c->fd,err,strlen(err)) == -1) {
2618 /* Nothing to do, Just to avoid the warning... */
2619 }
285add55 2620 freeClient(c);
2621 return;
2622 }
ed9b544e 2623 server.stat_numconnections++;
2624}
2625
2626/* ======================= Redis objects implementation ===================== */
2627
2628static robj *createObject(int type, void *ptr) {
2629 robj *o;
2630
a5819310 2631 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2632 if (listLength(server.objfreelist)) {
2633 listNode *head = listFirst(server.objfreelist);
2634 o = listNodeValue(head);
2635 listDelNode(server.objfreelist,head);
a5819310 2636 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2637 } else {
75680a3c 2638 if (server.vm_enabled) {
a5819310 2639 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2640 o = zmalloc(sizeof(*o));
2641 } else {
2642 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2643 }
ed9b544e 2644 }
ed9b544e 2645 o->type = type;
942a3961 2646 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2647 o->ptr = ptr;
2648 o->refcount = 1;
3a66edc7 2649 if (server.vm_enabled) {
1064ef87 2650 /* Note that this code may run in the context of an I/O thread
2651 * and accessing to server.unixtime in theory is an error
2652 * (no locks). But in practice this is safe, and even if we read
2653 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2654 o->vm.atime = server.unixtime;
2655 o->storage = REDIS_VM_MEMORY;
2656 }
ed9b544e 2657 return o;
2658}
2659
2660static robj *createStringObject(char *ptr, size_t len) {
2661 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2662}
2663
4ef8de8a 2664static robj *dupStringObject(robj *o) {
b9bc0eef 2665 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2666 return createStringObject(o->ptr,sdslen(o->ptr));
2667}
2668
ed9b544e 2669static robj *createListObject(void) {
2670 list *l = listCreate();
2671
ed9b544e 2672 listSetFreeMethod(l,decrRefCount);
2673 return createObject(REDIS_LIST,l);
2674}
2675
2676static robj *createSetObject(void) {
2677 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2678 return createObject(REDIS_SET,d);
2679}
2680
5234952b 2681static robj *createHashObject(void) {
2682 /* All the Hashes start as zipmaps. Will be automatically converted
2683 * into hash tables if there are enough elements or big elements
2684 * inside. */
2685 unsigned char *zm = zipmapNew();
2686 robj *o = createObject(REDIS_HASH,zm);
2687 o->encoding = REDIS_ENCODING_ZIPMAP;
2688 return o;
2689}
2690
1812e024 2691static robj *createZsetObject(void) {
6b47e12e 2692 zset *zs = zmalloc(sizeof(*zs));
2693
2694 zs->dict = dictCreate(&zsetDictType,NULL);
2695 zs->zsl = zslCreate();
2696 return createObject(REDIS_ZSET,zs);
1812e024 2697}
2698
ed9b544e 2699static void freeStringObject(robj *o) {
942a3961 2700 if (o->encoding == REDIS_ENCODING_RAW) {
2701 sdsfree(o->ptr);
2702 }
ed9b544e 2703}
2704
2705static void freeListObject(robj *o) {
2706 listRelease((list*) o->ptr);
2707}
2708
2709static void freeSetObject(robj *o) {
2710 dictRelease((dict*) o->ptr);
2711}
2712
fd8ccf44 2713static void freeZsetObject(robj *o) {
2714 zset *zs = o->ptr;
2715
2716 dictRelease(zs->dict);
2717 zslFree(zs->zsl);
2718 zfree(zs);
2719}
2720
ed9b544e 2721static void freeHashObject(robj *o) {
cbba7dd7 2722 switch (o->encoding) {
2723 case REDIS_ENCODING_HT:
2724 dictRelease((dict*) o->ptr);
2725 break;
2726 case REDIS_ENCODING_ZIPMAP:
2727 zfree(o->ptr);
2728 break;
2729 default:
2730 redisAssert(0);
2731 break;
2732 }
ed9b544e 2733}
2734
2735static void incrRefCount(robj *o) {
f2b8ab34 2736 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2737 o->refcount++;
2738}
2739
2740static void decrRefCount(void *obj) {
2741 robj *o = obj;
94754ccc 2742
970e10bb 2743 /* Object is a key of a swapped out value, or in the process of being
2744 * loaded. */
996cb5f7 2745 if (server.vm_enabled &&
2746 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2747 {
2748 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2749 redisAssert(o->refcount == 1);
2750 }
2751 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2752 redisAssert(o->type == REDIS_STRING);
a35ddf12 2753 freeStringObject(o);
2754 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2755 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2756 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2757 !listAddNodeHead(server.objfreelist,o))
2758 zfree(o);
a5819310 2759 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2760 server.vm_stats_swapped_objects--;
a35ddf12 2761 return;
2762 }
996cb5f7 2763 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2764 if (--(o->refcount) == 0) {
996cb5f7 2765 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2766 vmCancelThreadedIOJob(obj);
ed9b544e 2767 switch(o->type) {
2768 case REDIS_STRING: freeStringObject(o); break;
2769 case REDIS_LIST: freeListObject(o); break;
2770 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2771 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2772 case REDIS_HASH: freeHashObject(o); break;
78409a0f 2773 default: redisAssert(0); break;
ed9b544e 2774 }
a5819310 2775 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2776 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2777 !listAddNodeHead(server.objfreelist,o))
2778 zfree(o);
a5819310 2779 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2780 }
2781}
2782
942a3961 2783static robj *lookupKey(redisDb *db, robj *key) {
2784 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2785 if (de) {
55cf8433 2786 robj *key = dictGetEntryKey(de);
2787 robj *val = dictGetEntryVal(de);
3a66edc7 2788
55cf8433 2789 if (server.vm_enabled) {
996cb5f7 2790 if (key->storage == REDIS_VM_MEMORY ||
2791 key->storage == REDIS_VM_SWAPPING)
2792 {
2793 /* If we were swapping the object out, stop it, this key
2794 * was requested. */
2795 if (key->storage == REDIS_VM_SWAPPING)
2796 vmCancelThreadedIOJob(key);
55cf8433 2797 /* Update the access time of the key for the aging algorithm. */
2798 key->vm.atime = server.unixtime;
2799 } else {
d5d55fc3 2800 int notify = (key->storage == REDIS_VM_LOADING);
2801
55cf8433 2802 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2803 redisAssert(val == NULL);
55cf8433 2804 val = vmLoadObject(key);
2805 dictGetEntryVal(de) = val;
d5d55fc3 2806
2807 /* Clients blocked by the VM subsystem may be waiting for
2808 * this key... */
2809 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2810 }
2811 }
2812 return val;
3a66edc7 2813 } else {
2814 return NULL;
2815 }
942a3961 2816}
2817
2818static robj *lookupKeyRead(redisDb *db, robj *key) {
2819 expireIfNeeded(db,key);
2820 return lookupKey(db,key);
2821}
2822
2823static robj *lookupKeyWrite(redisDb *db, robj *key) {
2824 deleteIfVolatile(db,key);
2825 return lookupKey(db,key);
2826}
2827
92b27fe9 2828static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2829 robj *o = lookupKeyRead(c->db, key);
2830 if (!o) addReply(c,reply);
2831 return o;
2832}
2833
2834static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2835 robj *o = lookupKeyWrite(c->db, key);
2836 if (!o) addReply(c,reply);
2837 return o;
2838}
2839
2840static int checkType(redisClient *c, robj *o, int type) {
2841 if (o->type != type) {
2842 addReply(c,shared.wrongtypeerr);
2843 return 1;
2844 }
2845 return 0;
2846}
2847
942a3961 2848static int deleteKey(redisDb *db, robj *key) {
2849 int retval;
2850
2851 /* We need to protect key from destruction: after the first dictDelete()
2852 * it may happen that 'key' is no longer valid if we don't increment
2853 * it's count. This may happen when we get the object reference directly
2854 * from the hash table with dictRandomKey() or dict iterators */
2855 incrRefCount(key);
2856 if (dictSize(db->expires)) dictDelete(db->expires,key);
2857 retval = dictDelete(db->dict,key);
2858 decrRefCount(key);
2859
2860 return retval == DICT_OK;
2861}
2862
10c43610 2863/* Try to share an object against the shared objects pool */
2864static robj *tryObjectSharing(robj *o) {
2865 struct dictEntry *de;
2866 unsigned long c;
2867
3305306f 2868 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2869
dfc5e96c 2870 redisAssert(o->type == REDIS_STRING);
10c43610 2871 de = dictFind(server.sharingpool,o);
2872 if (de) {
2873 robj *shared = dictGetEntryKey(de);
2874
2875 c = ((unsigned long) dictGetEntryVal(de))+1;
2876 dictGetEntryVal(de) = (void*) c;
2877 incrRefCount(shared);
2878 decrRefCount(o);
2879 return shared;
2880 } else {
2881 /* Here we are using a stream algorihtm: Every time an object is
2882 * shared we increment its count, everytime there is a miss we
2883 * recrement the counter of a random object. If this object reaches
2884 * zero we remove the object and put the current object instead. */
3305306f 2885 if (dictSize(server.sharingpool) >=
10c43610 2886 server.sharingpoolsize) {
2887 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2888 redisAssert(de != NULL);
10c43610 2889 c = ((unsigned long) dictGetEntryVal(de))-1;
2890 dictGetEntryVal(de) = (void*) c;
2891 if (c == 0) {
2892 dictDelete(server.sharingpool,de->key);
2893 }
2894 } else {
2895 c = 0; /* If the pool is empty we want to add this object */
2896 }
2897 if (c == 0) {
2898 int retval;
2899
2900 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2901 redisAssert(retval == DICT_OK);
10c43610 2902 incrRefCount(o);
2903 }
2904 return o;
2905 }
2906}
2907
724a51b1 2908/* Check if the nul-terminated string 's' can be represented by a long
2909 * (that is, is a number that fits into long without any other space or
2910 * character before or after the digits).
2911 *
2912 * If so, the function returns REDIS_OK and *longval is set to the value
2913 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2914static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2915 char buf[32], *endptr;
2916 long value;
2917 int slen;
2918
2919 value = strtol(s, &endptr, 10);
2920 if (endptr[0] != '\0') return REDIS_ERR;
2921 slen = snprintf(buf,32,"%ld",value);
2922
2923 /* If the number converted back into a string is not identical
2924 * then it's not possible to encode the string as integer */
f69f2cba 2925 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2926 if (longval) *longval = value;
2927 return REDIS_OK;
2928}
2929
942a3961 2930/* Try to encode a string object in order to save space */
2931static int tryObjectEncoding(robj *o) {
2932 long value;
942a3961 2933 sds s = o->ptr;
3305306f 2934
942a3961 2935 if (o->encoding != REDIS_ENCODING_RAW)
2936 return REDIS_ERR; /* Already encoded */
3305306f 2937
942a3961 2938 /* It's not save to encode shared objects: shared objects can be shared
2939 * everywhere in the "object space" of Redis. Encoded objects can only
2940 * appear as "values" (and not, for instance, as keys) */
2941 if (o->refcount > 1) return REDIS_ERR;
3305306f 2942
942a3961 2943 /* Currently we try to encode only strings */
dfc5e96c 2944 redisAssert(o->type == REDIS_STRING);
94754ccc 2945
724a51b1 2946 /* Check if we can represent this string as a long integer */
2947 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2948
2949 /* Ok, this object can be encoded */
2950 o->encoding = REDIS_ENCODING_INT;
2951 sdsfree(o->ptr);
2952 o->ptr = (void*) value;
2953 return REDIS_OK;
2954}
2955
9d65a1bb 2956/* Get a decoded version of an encoded object (returned as a new object).
2957 * If the object is already raw-encoded just increment the ref count. */
2958static robj *getDecodedObject(robj *o) {
942a3961 2959 robj *dec;
2960
9d65a1bb 2961 if (o->encoding == REDIS_ENCODING_RAW) {
2962 incrRefCount(o);
2963 return o;
2964 }
942a3961 2965 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2966 char buf[32];
2967
2968 snprintf(buf,32,"%ld",(long)o->ptr);
2969 dec = createStringObject(buf,strlen(buf));
2970 return dec;
2971 } else {
dfc5e96c 2972 redisAssert(1 != 1);
942a3961 2973 }
3305306f 2974}
2975
d7f43c08 2976/* Compare two string objects via strcmp() or alike.
2977 * Note that the objects may be integer-encoded. In such a case we
2978 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2979 * and compare the strings, it's much faster than calling getDecodedObject().
2980 *
2981 * Important note: if objects are not integer encoded, but binary-safe strings,
2982 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2983 * binary safe. */
724a51b1 2984static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2985 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2986 char bufa[128], bufb[128], *astr, *bstr;
2987 int bothsds = 1;
724a51b1 2988
e197b441 2989 if (a == b) return 0;
d7f43c08 2990 if (a->encoding != REDIS_ENCODING_RAW) {
2991 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2992 astr = bufa;
2993 bothsds = 0;
724a51b1 2994 } else {
d7f43c08 2995 astr = a->ptr;
724a51b1 2996 }
d7f43c08 2997 if (b->encoding != REDIS_ENCODING_RAW) {
2998 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2999 bstr = bufb;
3000 bothsds = 0;
3001 } else {
3002 bstr = b->ptr;
3003 }
3004 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3005}
3006
0ea663ea 3007static size_t stringObjectLen(robj *o) {
dfc5e96c 3008 redisAssert(o->type == REDIS_STRING);
0ea663ea 3009 if (o->encoding == REDIS_ENCODING_RAW) {
3010 return sdslen(o->ptr);
3011 } else {
3012 char buf[32];
3013
3014 return snprintf(buf,32,"%ld",(long)o->ptr);
3015 }
3016}
3017
06233c45 3018/*============================ RDB saving/loading =========================== */
ed9b544e 3019
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3024
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,4,1,fp) == 0) ? -1 : 0;
}
3030
e3566d4b 3031/* check rdbLoadLen() comments for more info */
f78fd11b 3032static int rdbSaveLen(FILE *fp, uint32_t len) {
3033 unsigned char buf[2];
3034
3035 if (len < (1<<6)) {
3036 /* Save a 6 bit len */
10c43610 3037 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3038 if (fwrite(buf,1,1,fp) == 0) return -1;
3039 } else if (len < (1<<14)) {
3040 /* Save a 14 bit len */
10c43610 3041 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3042 buf[1] = len&0xFF;
17be1a4a 3043 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3044 } else {
3045 /* Save a 32 bit len */
10c43610 3046 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3047 if (fwrite(buf,1,1,fp) == 0) return -1;
3048 len = htonl(len);
3049 if (fwrite(&len,4,1,fp) == 0) return -1;
3050 }
3051 return 0;
3052}
3053
e3566d4b 3054/* String objects in the form "2391" "-100" without any space and with a
3055 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3056 * encoded as integers to save space */
b1befe6a 3057static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3058 long long value;
3059 char *endptr, buf[32];
3060
3061 /* Check if it's possible to encode this value as a number */
3062 value = strtoll(s, &endptr, 10);
3063 if (endptr[0] != '\0') return 0;
3064 snprintf(buf,32,"%lld",value);
3065
3066 /* If the number converted back into a string is not identical
3067 * then it's not possible to encode the string as integer */
b1befe6a 3068 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3069
3070 /* Finally check if it fits in our ranges */
3071 if (value >= -(1<<7) && value <= (1<<7)-1) {
3072 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3073 enc[1] = value&0xFF;
3074 return 2;
3075 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3076 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3077 enc[1] = value&0xFF;
3078 enc[2] = (value>>8)&0xFF;
3079 return 3;
3080 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3081 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3082 enc[1] = value&0xFF;
3083 enc[2] = (value>>8)&0xFF;
3084 enc[3] = (value>>16)&0xFF;
3085 enc[4] = (value>>24)&0xFF;
3086 return 5;
3087 } else {
3088 return 0;
3089 }
3090}
3091
b1befe6a 3092static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3093 size_t comprlen, outlen;
774e3047 3094 unsigned char byte;
3095 void *out;
3096
3097 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3098 if (len <= 4) return 0;
3099 outlen = len-4;
3a2694c4 3100 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3101 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3102 if (comprlen == 0) {
88e85998 3103 zfree(out);
774e3047 3104 return 0;
3105 }
3106 /* Data compressed! Let's save it on disk */
3107 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3108 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3109 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3110 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3111 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3112 zfree(out);
774e3047 3113 return comprlen;
3114
3115writeerr:
88e85998 3116 zfree(out);
774e3047 3117 return -1;
3118}
3119
e3566d4b 3120/* Save a string objet as [len][data] on disk. If the object is a string
3121 * representation of an integer value we try to safe it in a special form */
b1befe6a 3122static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3123 int enclen;
10c43610 3124
774e3047 3125 /* Try integer encoding */
e3566d4b 3126 if (len <= 11) {
3127 unsigned char buf[5];
b1befe6a 3128 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3129 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3130 return 0;
3131 }
3132 }
774e3047 3133
3134 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3135 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3136 if (server.rdbcompression && len > 20) {
774e3047 3137 int retval;
3138
b1befe6a 3139 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3140 if (retval == -1) return -1;
3141 if (retval > 0) return 0;
3142 /* retval == 0 means data can't be compressed, save the old way */
3143 }
3144
3145 /* Store verbatim */
10c43610 3146 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3147 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3148 return 0;
3149}
3150
942a3961 3151/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3152static int rdbSaveStringObject(FILE *fp, robj *obj) {
3153 int retval;
942a3961 3154
f2d9f50f 3155 /* Avoid incr/decr ref count business when possible.
3156 * This plays well with copy-on-write given that we are probably
3157 * in a child process (BGSAVE). Also this makes sure key objects
3158 * of swapped objects are not incRefCount-ed (an assert does not allow
3159 * this in order to avoid bugs) */
3160 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3161 obj = getDecodedObject(obj);
b1befe6a 3162 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3163 decrRefCount(obj);
3164 } else {
b1befe6a 3165 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3166 }
9d65a1bb 3167 return retval;
942a3961 3168}
3169
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * For the special values only the prefix byte is written.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3196
06233c45 3197/* Save a Redis object. */
3198static int rdbSaveObject(FILE *fp, robj *o) {
3199 if (o->type == REDIS_STRING) {
3200 /* Save a string value */
3201 if (rdbSaveStringObject(fp,o) == -1) return -1;
3202 } else if (o->type == REDIS_LIST) {
3203 /* Save a list value */
3204 list *list = o->ptr;
c7df85a4 3205 listIter li;
06233c45 3206 listNode *ln;
3207
06233c45 3208 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3209 listRewind(list,&li);
3210 while((ln = listNext(&li))) {
06233c45 3211 robj *eleobj = listNodeValue(ln);
3212
3213 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3214 }
3215 } else if (o->type == REDIS_SET) {
3216 /* Save a set value */
3217 dict *set = o->ptr;
3218 dictIterator *di = dictGetIterator(set);
3219 dictEntry *de;
3220
3221 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3222 while((de = dictNext(di)) != NULL) {
3223 robj *eleobj = dictGetEntryKey(de);
3224
3225 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3226 }
3227 dictReleaseIterator(di);
3228 } else if (o->type == REDIS_ZSET) {
3229 /* Save a set value */
3230 zset *zs = o->ptr;
3231 dictIterator *di = dictGetIterator(zs->dict);
3232 dictEntry *de;
3233
3234 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3235 while((de = dictNext(di)) != NULL) {
3236 robj *eleobj = dictGetEntryKey(de);
3237 double *score = dictGetEntryVal(de);
3238
3239 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3240 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3241 }
3242 dictReleaseIterator(di);
b1befe6a 3243 } else if (o->type == REDIS_HASH) {
3244 /* Save a hash value */
3245 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3246 unsigned char *p = zipmapRewind(o->ptr);
3247 unsigned int count = zipmapLen(o->ptr);
3248 unsigned char *key, *val;
3249 unsigned int klen, vlen;
3250
3251 if (rdbSaveLen(fp,count) == -1) return -1;
3252 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3253 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3254 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3255 }
3256 } else {
3257 dictIterator *di = dictGetIterator(o->ptr);
3258 dictEntry *de;
3259
3260 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3261 while((de = dictNext(di)) != NULL) {
3262 robj *key = dictGetEntryKey(de);
3263 robj *val = dictGetEntryVal(de);
3264
3265 if (rdbSaveStringObject(fp,key) == -1) return -1;
3266 if (rdbSaveStringObject(fp,val) == -1) return -1;
3267 }
3268 dictReleaseIterator(di);
3269 }
06233c45 3270 } else {
78409a0f 3271 redisAssert(0);
06233c45 3272 }
3273 return 0;
3274}
3275
3276/* Return the length the object will have on disk if saved with
3277 * the rdbSaveObject() function. Currently we use a trick to get
3278 * this length with very little changes to the code. In the future
3279 * we could switch to a faster solution. */
b9bc0eef 3280static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3281 if (fp == NULL) fp = server.devnull;
06233c45 3282 rewind(fp);
3283 assert(rdbSaveObject(fp,o) != 1);
3284 return ftello(fp);
3285}
3286
06224fec 3287/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3288static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3289 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3290
3291 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3292}
3293
ed9b544e 3294/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3295static int rdbSave(char *filename) {
ed9b544e 3296 dictIterator *di = NULL;
3297 dictEntry *de;
ed9b544e 3298 FILE *fp;
3299 char tmpfile[256];
3300 int j;
bb32ede5 3301 time_t now = time(NULL);
ed9b544e 3302
2316bb3b 3303 /* Wait for I/O therads to terminate, just in case this is a
3304 * foreground-saving, to avoid seeking the swap file descriptor at the
3305 * same time. */
3306 if (server.vm_enabled)
3307 waitEmptyIOJobsQueue();
3308
a3b21203 3309 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3310 fp = fopen(tmpfile,"w");
3311 if (!fp) {
3312 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3313 return REDIS_ERR;
3314 }
f78fd11b 3315 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3316 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3317 redisDb *db = server.db+j;
3318 dict *d = db->dict;
3305306f 3319 if (dictSize(d) == 0) continue;
ed9b544e 3320 di = dictGetIterator(d);
3321 if (!di) {
3322 fclose(fp);
3323 return REDIS_ERR;
3324 }
3325
3326 /* Write the SELECT DB opcode */
f78fd11b 3327 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3328 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3329
3330 /* Iterate this DB writing every entry */
3331 while((de = dictNext(di)) != NULL) {
3332 robj *key = dictGetEntryKey(de);
3333 robj *o = dictGetEntryVal(de);
bb32ede5 3334 time_t expiretime = getExpire(db,key);
3335
3336 /* Save the expire time */
3337 if (expiretime != -1) {
3338 /* If this key is already expired skip it */
3339 if (expiretime < now) continue;
3340 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3341 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3342 }
7e69548d 3343 /* Save the key and associated value. This requires special
3344 * handling if the value is swapped out. */
996cb5f7 3345 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3346 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3347 /* Save type, key, value */
3348 if (rdbSaveType(fp,o->type) == -1) goto werr;
3349 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3350 if (rdbSaveObject(fp,o) == -1) goto werr;
3351 } else {
996cb5f7 3352 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3353 robj *po;
7e69548d 3354 /* Get a preview of the object in memory */
3355 po = vmPreviewObject(key);
7e69548d 3356 /* Save type, key, value */
3357 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3358 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3359 if (rdbSaveObject(fp,po) == -1) goto werr;
3360 /* Remove the loaded object from memory */
3361 decrRefCount(po);
7e69548d 3362 }
ed9b544e 3363 }
3364 dictReleaseIterator(di);
3365 }
3366 /* EOF opcode */
f78fd11b 3367 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3368
3369 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3370 fflush(fp);
3371 fsync(fileno(fp));
3372 fclose(fp);
3373
3374 /* Use RENAME to make sure the DB file is changed atomically only
3375 * if the generate DB file is ok. */
3376 if (rename(tmpfile,filename) == -1) {
325d1eb4 3377 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3378 unlink(tmpfile);
3379 return REDIS_ERR;
3380 }
3381 redisLog(REDIS_NOTICE,"DB saved on disk");
3382 server.dirty = 0;
3383 server.lastsave = time(NULL);
3384 return REDIS_OK;
3385
3386werr:
3387 fclose(fp);
3388 unlink(tmpfile);
3389 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3390 if (di) dictReleaseIterator(di);
3391 return REDIS_ERR;
3392}
3393
f78fd11b 3394static int rdbSaveBackground(char *filename) {
ed9b544e 3395 pid_t childpid;
3396
9d65a1bb 3397 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3398 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3399 if ((childpid = fork()) == 0) {
3400 /* Child */
054e426d 3401 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3402 close(server.fd);
f78fd11b 3403 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3404 _exit(0);
ed9b544e 3405 } else {
478c2c6f 3406 _exit(1);
ed9b544e 3407 }
3408 } else {
3409 /* Parent */
5a7c647e 3410 if (childpid == -1) {
3411 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3412 strerror(errno));
3413 return REDIS_ERR;
3414 }
ed9b544e 3415 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3416 server.bgsavechildpid = childpid;
ed9b544e 3417 return REDIS_OK;
3418 }
3419 return REDIS_OK; /* unreached */
3420}
3421
/* Remove the temporary "temp-<pid>.rdb" file associated with the given
 * child pid. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3428
/* Read a one byte type/opcode from 'fp'. Returns the byte value (0-255),
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;
    size_t nread = fread(&byte,1,1,fp);

    return (nread == 0) ? -1 : byte;
}
3434
/* Read a time value stored as a raw 32 bit signed integer (in the byte
 * order of the host) from 'fp'. Returns -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3440
e3566d4b 3441/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3442 * of this file for a description of how this are stored on disk.
3443 *
3444 * isencoded is set to 1 if the readed length is not actually a length but
3445 * an "encoding type", check the above comments for more info */
c78a8ccc 3446static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3447 unsigned char buf[2];
3448 uint32_t len;
c78a8ccc 3449 int type;
f78fd11b 3450
e3566d4b 3451 if (isencoded) *isencoded = 0;
c78a8ccc 3452 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3453 type = (buf[0]&0xC0)>>6;
3454 if (type == REDIS_RDB_6BITLEN) {
3455 /* Read a 6 bit len */
3456 return buf[0]&0x3F;
3457 } else if (type == REDIS_RDB_ENCVAL) {
3458 /* Read a 6 bit len encoding type */
3459 if (isencoded) *isencoded = 1;
3460 return buf[0]&0x3F;
3461 } else if (type == REDIS_RDB_14BITLEN) {
3462 /* Read a 14 bit len */
3463 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3464 return ((buf[0]&0x3F)<<8)|buf[1];
3465 } else {
3466 /* Read a 32 bit len */
f78fd11b 3467 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3468 return ntohl(len);
f78fd11b 3469 }
f78fd11b 3470}
3471
e3566d4b 3472static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3473 unsigned char enc[4];
3474 long long val;
3475
3476 if (enctype == REDIS_RDB_ENC_INT8) {
3477 if (fread(enc,1,1,fp) == 0) return NULL;
3478 val = (signed char)enc[0];
3479 } else if (enctype == REDIS_RDB_ENC_INT16) {
3480 uint16_t v;
3481 if (fread(enc,2,1,fp) == 0) return NULL;
3482 v = enc[0]|(enc[1]<<8);
3483 val = (int16_t)v;
3484 } else if (enctype == REDIS_RDB_ENC_INT32) {
3485 uint32_t v;
3486 if (fread(enc,4,1,fp) == 0) return NULL;
3487 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3488 val = (int32_t)v;
3489 } else {
3490 val = 0; /* anti-warning */
78409a0f 3491 redisAssert(0);
e3566d4b 3492 }
3493 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3494}
3495
c78a8ccc 3496static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3497 unsigned int len, clen;
3498 unsigned char *c = NULL;
3499 sds val = NULL;
3500
c78a8ccc 3501 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3502 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3503 if ((c = zmalloc(clen)) == NULL) goto err;
3504 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3505 if (fread(c,clen,1,fp) == 0) goto err;
3506 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3507 zfree(c);
88e85998 3508 return createObject(REDIS_STRING,val);
3509err:
3510 zfree(c);
3511 sdsfree(val);
3512 return NULL;
3513}
3514
c78a8ccc 3515static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3516 int isencoded;
3517 uint32_t len;
f78fd11b 3518 sds val;
3519
c78a8ccc 3520 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3521 if (isencoded) {
3522 switch(len) {
3523 case REDIS_RDB_ENC_INT8:
3524 case REDIS_RDB_ENC_INT16:
3525 case REDIS_RDB_ENC_INT32:
3305306f 3526 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3527 case REDIS_RDB_ENC_LZF:
c78a8ccc 3528 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3529 default:
78409a0f 3530 redisAssert(0);
e3566d4b 3531 }
3532 }
3533
f78fd11b 3534 if (len == REDIS_RDB_LENERR) return NULL;
3535 val = sdsnewlen(NULL,len);
3536 if (len && fread(val,len,1,fp) == 0) {
3537 sdsfree(val);
3538 return NULL;
3539 }
10c43610 3540 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3541}
3542
a7866db6 3543/* For information about double serialization check rdbSaveDoubleValue() */
3544static int rdbLoadDoubleValue(FILE *fp, double *val) {
3545 char buf[128];
3546 unsigned char len;
3547
3548 if (fread(&len,1,1,fp) == 0) return -1;
3549 switch(len) {
3550 case 255: *val = R_NegInf; return 0;
3551 case 254: *val = R_PosInf; return 0;
3552 case 253: *val = R_Nan; return 0;
3553 default:
3554 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3555 buf[len] = '\0';
a7866db6 3556 sscanf(buf, "%lg", val);
3557 return 0;
3558 }
3559}
3560
c78a8ccc 3561/* Load a Redis object of the specified type from the specified file.
3562 * On success a newly allocated object is returned, otherwise NULL. */
3563static robj *rdbLoadObject(int type, FILE *fp) {
3564 robj *o;
3565
bcd11906 3566 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3567 if (type == REDIS_STRING) {
3568 /* Read string value */
3569 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3570 tryObjectEncoding(o);
3571 } else if (type == REDIS_LIST || type == REDIS_SET) {
3572 /* Read list/set value */
3573 uint32_t listlen;
3574
3575 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3576 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3577 /* It's faster to expand the dict to the right size asap in order
3578 * to avoid rehashing */
3579 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3580 dictExpand(o->ptr,listlen);
c78a8ccc 3581 /* Load every single element of the list/set */
3582 while(listlen--) {
3583 robj *ele;
3584
3585 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3586 tryObjectEncoding(ele);
3587 if (type == REDIS_LIST) {
3588 listAddNodeTail((list*)o->ptr,ele);
3589 } else {
3590 dictAdd((dict*)o->ptr,ele,NULL);
3591 }
3592 }
3593 } else if (type == REDIS_ZSET) {
3594 /* Read list/set value */
ada386b2 3595 size_t zsetlen;
c78a8ccc 3596 zset *zs;
3597
3598 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3599 o = createZsetObject();
3600 zs = o->ptr;
3601 /* Load every single element of the list/set */
3602 while(zsetlen--) {
3603 robj *ele;
3604 double *score = zmalloc(sizeof(double));
3605
3606 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3607 tryObjectEncoding(ele);
3608 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3609 dictAdd(zs->dict,ele,score);
3610 zslInsert(zs->zsl,*score,ele);
3611 incrRefCount(ele); /* added to skiplist */
3612 }
ada386b2 3613 } else if (type == REDIS_HASH) {
3614 size_t hashlen;
3615
3616 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3617 o = createHashObject();
3618 /* Too many entries? Use an hash table. */
3619 if (hashlen > server.hash_max_zipmap_entries)
3620 convertToRealHash(o);
3621 /* Load every key/value, then set it into the zipmap or hash
3622 * table, as needed. */
3623 while(hashlen--) {
3624 robj *key, *val;
3625
3626 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3627 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3628 /* If we are using a zipmap and there are too big values
3629 * the object is converted to real hash table encoding. */
3630 if (o->encoding != REDIS_ENCODING_HT &&
3631 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3632 sdslen(val->ptr) > server.hash_max_zipmap_value))
3633 {
3634 convertToRealHash(o);
3635 }
3636
3637 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3638 unsigned char *zm = o->ptr;
3639
3640 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3641 val->ptr,sdslen(val->ptr),NULL);
3642 o->ptr = zm;
3643 decrRefCount(key);
3644 decrRefCount(val);
3645 } else {
3646 tryObjectEncoding(key);
3647 tryObjectEncoding(val);
3648 dictAdd((dict*)o->ptr,key,val);
ada386b2 3649 }
3650 }
c78a8ccc 3651 } else {
78409a0f 3652 redisAssert(0);
c78a8ccc 3653 }
3654 return o;
3655}
3656
f78fd11b 3657static int rdbLoad(char *filename) {
ed9b544e 3658 FILE *fp;
f78fd11b 3659 robj *keyobj = NULL;
3660 uint32_t dbid;
bb32ede5 3661 int type, retval, rdbver;
3305306f 3662 dict *d = server.db[0].dict;
bb32ede5 3663 redisDb *db = server.db+0;
f78fd11b 3664 char buf[1024];
bb32ede5 3665 time_t expiretime = -1, now = time(NULL);
b492cf00 3666 long long loadedkeys = 0;
bb32ede5 3667
ed9b544e 3668 fp = fopen(filename,"r");
3669 if (!fp) return REDIS_ERR;
3670 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3671 buf[9] = '\0';
3672 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3673 fclose(fp);
3674 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3675 return REDIS_ERR;
3676 }
f78fd11b 3677 rdbver = atoi(buf+5);
c78a8ccc 3678 if (rdbver != 1) {
f78fd11b 3679 fclose(fp);
3680 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3681 return REDIS_ERR;
3682 }
ed9b544e 3683 while(1) {
3684 robj *o;
3685
3686 /* Read type. */
f78fd11b 3687 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3688 if (type == REDIS_EXPIRETIME) {
3689 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3690 /* We read the time so we need to read the object type again */
3691 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3692 }
ed9b544e 3693 if (type == REDIS_EOF) break;
3694 /* Handle SELECT DB opcode as a special case */
3695 if (type == REDIS_SELECTDB) {
c78a8ccc 3696 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3697 goto eoferr;
ed9b544e 3698 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3699 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3700 exit(1);
3701 }
bb32ede5 3702 db = server.db+dbid;
3703 d = db->dict;
ed9b544e 3704 continue;
3705 }
3706 /* Read key */
c78a8ccc 3707 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3708 /* Read value */
3709 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3710 /* Add the new object in the hash table */
f78fd11b 3711 retval = dictAdd(d,keyobj,o);
ed9b544e 3712 if (retval == DICT_ERR) {
f78fd11b 3713 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3714 exit(1);
3715 }
bb32ede5 3716 /* Set the expire time if needed */
3717 if (expiretime != -1) {
3718 setExpire(db,keyobj,expiretime);
3719 /* Delete this key if already expired */
3720 if (expiretime < now) deleteKey(db,keyobj);
3721 expiretime = -1;
3722 }
f78fd11b 3723 keyobj = o = NULL;
b492cf00 3724 /* Handle swapping while loading big datasets when VM is on */
3725 loadedkeys++;
3726 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3727 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3728 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3729 }
3730 }
ed9b544e 3731 }
3732 fclose(fp);
3733 return REDIS_OK;
3734
3735eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3736 if (keyobj) decrRefCount(keyobj);
f80dff62 3737 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3738 exit(1);
3739 return REDIS_ERR; /* Just to avoid warning */
3740}
3741
3742/*================================== Commands =============================== */
3743
abcb223e 3744static void authCommand(redisClient *c) {
2e77c2ee 3745 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3746 c->authenticated = 1;
3747 addReply(c,shared.ok);
3748 } else {
3749 c->authenticated = 0;
fa4c0aba 3750 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3751 }
3752}
3753
ed9b544e 3754static void pingCommand(redisClient *c) {
3755 addReply(c,shared.pong);
3756}
3757
3758static void echoCommand(redisClient *c) {
dd88747b 3759 addReplyBulk(c,c->argv[1]);
ed9b544e 3760}
3761
3762/*=================================== Strings =============================== */
3763
3764static void setGenericCommand(redisClient *c, int nx) {
3765 int retval;
3766
333fd216 3767 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3768 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3769 if (retval == DICT_ERR) {
3770 if (!nx) {
1b03836c 3771 /* If the key is about a swapped value, we want a new key object
3772 * to overwrite the old. So we delete the old key in the database.
3773 * This will also make sure that swap pages about the old object
3774 * will be marked as free. */
ddfaca9d 3775 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3776 incrRefCount(c->argv[1]);
3305306f 3777 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3778 incrRefCount(c->argv[2]);
3779 } else {
c937aa89 3780 addReply(c,shared.czero);
ed9b544e 3781 return;
3782 }
3783 } else {
3784 incrRefCount(c->argv[1]);
3785 incrRefCount(c->argv[2]);
3786 }
3787 server.dirty++;
3305306f 3788 removeExpire(c->db,c->argv[1]);
c937aa89 3789 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3790}
3791
3792static void setCommand(redisClient *c) {
a4d1ba9a 3793 setGenericCommand(c,0);
ed9b544e 3794}
3795
3796static void setnxCommand(redisClient *c) {
a4d1ba9a 3797 setGenericCommand(c,1);
ed9b544e 3798}
3799
322fc7d8 3800static int getGenericCommand(redisClient *c) {
dd88747b 3801 robj *o;
3802
3803 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 3804 return REDIS_OK;
dd88747b 3805
3806 if (o->type != REDIS_STRING) {
3807 addReply(c,shared.wrongtypeerr);
3808 return REDIS_ERR;
ed9b544e 3809 } else {
dd88747b 3810 addReplyBulk(c,o);
3811 return REDIS_OK;
ed9b544e 3812 }
3813}
3814
322fc7d8 3815static void getCommand(redisClient *c) {
3816 getGenericCommand(c);
3817}
3818
f6b141c5 3819static void getsetCommand(redisClient *c) {
322fc7d8 3820 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3821 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3822 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3823 } else {
3824 incrRefCount(c->argv[1]);
3825 }
3826 incrRefCount(c->argv[2]);
3827 server.dirty++;
3828 removeExpire(c->db,c->argv[1]);
3829}
3830
70003d28 3831static void mgetCommand(redisClient *c) {
70003d28 3832 int j;
3833
c937aa89 3834 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3835 for (j = 1; j < c->argc; j++) {
3305306f 3836 robj *o = lookupKeyRead(c->db,c->argv[j]);
3837 if (o == NULL) {
c937aa89 3838 addReply(c,shared.nullbulk);
70003d28 3839 } else {
70003d28 3840 if (o->type != REDIS_STRING) {
c937aa89 3841 addReply(c,shared.nullbulk);
70003d28 3842 } else {
dd88747b 3843 addReplyBulk(c,o);
70003d28 3844 }
3845 }
3846 }
3847}
3848
6c446631 3849static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3850 int j, busykeys = 0;
6c446631 3851
3852 if ((c->argc % 2) == 0) {
454d4e43 3853 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3854 return;
3855 }
3856 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3857 * set nothing at all if at least one already key exists. */
3858 if (nx) {
3859 for (j = 1; j < c->argc; j += 2) {
906573e7 3860 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3861 busykeys++;
6c446631 3862 }
3863 }
3864 }
906573e7 3865 if (busykeys) {
3866 addReply(c, shared.czero);
3867 return;
3868 }
6c446631 3869
3870 for (j = 1; j < c->argc; j += 2) {
3871 int retval;
3872
17511391 3873 tryObjectEncoding(c->argv[j+1]);
6c446631 3874 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3875 if (retval == DICT_ERR) {
3876 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3877 incrRefCount(c->argv[j+1]);
3878 } else {
3879 incrRefCount(c->argv[j]);
3880 incrRefCount(c->argv[j+1]);
3881 }
3882 removeExpire(c->db,c->argv[j]);
3883 }
3884 server.dirty += (c->argc-1)/2;
3885 addReply(c, nx ? shared.cone : shared.ok);
3886}
3887
3888static void msetCommand(redisClient *c) {
3889 msetGenericCommand(c,0);
3890}
3891
3892static void msetnxCommand(redisClient *c) {
3893 msetGenericCommand(c,1);
3894}
3895
d68ed120 3896static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3897 long long value;
3898 int retval;
3899 robj *o;
3900
3305306f 3901 o = lookupKeyWrite(c->db,c->argv[1]);
3902 if (o == NULL) {
ed9b544e 3903 value = 0;
3904 } else {
ed9b544e 3905 if (o->type != REDIS_STRING) {
3906 value = 0;
3907 } else {
3908 char *eptr;
3909
942a3961 3910 if (o->encoding == REDIS_ENCODING_RAW)
3911 value = strtoll(o->ptr, &eptr, 10);
3912 else if (o->encoding == REDIS_ENCODING_INT)
3913 value = (long)o->ptr;
3914 else
dfc5e96c 3915 redisAssert(1 != 1);
ed9b544e 3916 }
3917 }
3918
3919 value += incr;
3920 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3921 tryObjectEncoding(o);
3305306f 3922 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3923 if (retval == DICT_ERR) {
3305306f 3924 dictReplace(c->db->dict,c->argv[1],o);
3925 removeExpire(c->db,c->argv[1]);
ed9b544e 3926 } else {
3927 incrRefCount(c->argv[1]);
3928 }
3929 server.dirty++;
c937aa89 3930 addReply(c,shared.colon);
ed9b544e 3931 addReply(c,o);
3932 addReply(c,shared.crlf);
3933}
3934
3935static void incrCommand(redisClient *c) {
a4d1ba9a 3936 incrDecrCommand(c,1);
ed9b544e 3937}
3938
3939static void decrCommand(redisClient *c) {
a4d1ba9a 3940 incrDecrCommand(c,-1);
ed9b544e 3941}
3942
3943static void incrbyCommand(redisClient *c) {
d68ed120 3944 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3945 incrDecrCommand(c,incr);
ed9b544e 3946}
3947
3948static void decrbyCommand(redisClient *c) {
d68ed120 3949 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3950 incrDecrCommand(c,-incr);
ed9b544e 3951}
3952
4b00bebd 3953static void appendCommand(redisClient *c) {
3954 int retval;
3955 size_t totlen;
3956 robj *o;
3957
3958 o = lookupKeyWrite(c->db,c->argv[1]);
3959 if (o == NULL) {
3960 /* Create the key */
3961 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3962 incrRefCount(c->argv[1]);
3963 incrRefCount(c->argv[2]);
3964 totlen = stringObjectLen(c->argv[2]);
3965 } else {
3966 dictEntry *de;
3967
3968 de = dictFind(c->db->dict,c->argv[1]);
3969 assert(de != NULL);
3970
3971 o = dictGetEntryVal(de);
3972 if (o->type != REDIS_STRING) {
3973 addReply(c,shared.wrongtypeerr);
3974 return;
3975 }
3976 /* If the object is specially encoded or shared we have to make
3977 * a copy */
3978 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3979 robj *decoded = getDecodedObject(o);
3980
3981 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3982 decrRefCount(decoded);
3983 dictReplace(c->db->dict,c->argv[1],o);
3984 }
3985 /* APPEND! */
3986 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3987 o->ptr = sdscatlen(o->ptr,
3988 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3989 } else {
3990 o->ptr = sdscatprintf(o->ptr, "%ld",
3991 (unsigned long) c->argv[2]->ptr);
3992 }
3993 totlen = sdslen(o->ptr);
3994 }
3995 server.dirty++;
3996 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3997}
3998
39191553 3999static void substrCommand(redisClient *c) {
4000 robj *o;
4001 long start = atoi(c->argv[2]->ptr);
4002 long end = atoi(c->argv[3]->ptr);
dd88747b 4003 size_t rangelen, strlen;
4004 sds range;
39191553 4005
dd88747b 4006 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4007 checkType(c,o,REDIS_STRING)) return;
39191553 4008
dd88747b 4009 o = getDecodedObject(o);
4010 strlen = sdslen(o->ptr);
8fe7fad7 4011
dd88747b 4012 /* convert negative indexes */
4013 if (start < 0) start = strlen+start;
4014 if (end < 0) end = strlen+end;
4015 if (start < 0) start = 0;
4016 if (end < 0) end = 0;
39191553 4017
dd88747b 4018 /* indexes sanity checks */
4019 if (start > end || (size_t)start >= strlen) {
4020 /* Out of range start or start > end result in null reply */
4021 addReply(c,shared.nullbulk);
4022 decrRefCount(o);
4023 return;
39191553 4024 }
dd88747b 4025 if ((size_t)end >= strlen) end = strlen-1;
4026 rangelen = (end-start)+1;
4027
4028 /* Return the result */
4029 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4030 range = sdsnewlen((char*)o->ptr+start,rangelen);
4031 addReplySds(c,range);
4032 addReply(c,shared.crlf);
4033 decrRefCount(o);
39191553 4034}
4035
ed9b544e 4036/* ========================= Type agnostic commands ========================= */
4037
4038static void delCommand(redisClient *c) {
5109cdff 4039 int deleted = 0, j;
4040
4041 for (j = 1; j < c->argc; j++) {
4042 if (deleteKey(c->db,c->argv[j])) {
4043 server.dirty++;
4044 deleted++;
4045 }
4046 }
dd88747b 4047 addReplyLong(c,deleted);
ed9b544e 4048}
4049
4050static void existsCommand(redisClient *c) {
3305306f 4051 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4052}
4053
4054static void selectCommand(redisClient *c) {
4055 int id = atoi(c->argv[1]->ptr);
4056
4057 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4058 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4059 } else {
4060 addReply(c,shared.ok);
4061 }
4062}
4063
4064static void randomkeyCommand(redisClient *c) {
4065 dictEntry *de;
3305306f 4066
4067 while(1) {
4068 de = dictGetRandomKey(c->db->dict);
ce7bef07 4069 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4070 }
ed9b544e 4071 if (de == NULL) {
ce7bef07 4072 addReply(c,shared.plus);
ed9b544e 4073 addReply(c,shared.crlf);
4074 } else {
c937aa89 4075 addReply(c,shared.plus);
ed9b544e 4076 addReply(c,dictGetEntryKey(de));
4077 addReply(c,shared.crlf);
4078 }
4079}
4080
4081static void keysCommand(redisClient *c) {
4082 dictIterator *di;
4083 dictEntry *de;
4084 sds pattern = c->argv[1]->ptr;
4085 int plen = sdslen(pattern);
a3f9eec2 4086 unsigned long numkeys = 0;
ed9b544e 4087 robj *lenobj = createObject(REDIS_STRING,NULL);
4088
3305306f 4089 di = dictGetIterator(c->db->dict);
ed9b544e 4090 addReply(c,lenobj);
4091 decrRefCount(lenobj);
4092 while((de = dictNext(di)) != NULL) {
4093 robj *keyobj = dictGetEntryKey(de);
3305306f 4094
ed9b544e 4095 sds key = keyobj->ptr;
4096 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4097 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4098 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4099 addReplyBulk(c,keyobj);
3305306f 4100 numkeys++;
3305306f 4101 }
ed9b544e 4102 }
4103 }
4104 dictReleaseIterator(di);
a3f9eec2 4105 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4106}
4107
4108static void dbsizeCommand(redisClient *c) {
4109 addReplySds(c,
3305306f 4110 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4111}
4112
4113static void lastsaveCommand(redisClient *c) {
4114 addReplySds(c,
c937aa89 4115 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4116}
4117
4118static void typeCommand(redisClient *c) {
3305306f 4119 robj *o;
ed9b544e 4120 char *type;
3305306f 4121
4122 o = lookupKeyRead(c->db,c->argv[1]);
4123 if (o == NULL) {
c937aa89 4124 type = "+none";
ed9b544e 4125 } else {
ed9b544e 4126 switch(o->type) {
c937aa89 4127 case REDIS_STRING: type = "+string"; break;
4128 case REDIS_LIST: type = "+list"; break;
4129 case REDIS_SET: type = "+set"; break;
412a8bce 4130 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4131 case REDIS_HASH: type = "+hash"; break;
4132 default: type = "+unknown"; break;
ed9b544e 4133 }
4134 }
4135 addReplySds(c,sdsnew(type));
4136 addReply(c,shared.crlf);
4137}
4138
4139static void saveCommand(redisClient *c) {
9d65a1bb 4140 if (server.bgsavechildpid != -1) {
05557f6d 4141 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4142 return;
4143 }
f78fd11b 4144 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4145 addReply(c,shared.ok);
4146 } else {
4147 addReply(c,shared.err);
4148 }
4149}
4150
4151static void bgsaveCommand(redisClient *c) {
9d65a1bb 4152 if (server.bgsavechildpid != -1) {
ed9b544e 4153 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4154 return;
4155 }
f78fd11b 4156 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4157 char *status = "+Background saving started\r\n";
4158 addReplySds(c,sdsnew(status));
ed9b544e 4159 } else {
4160 addReply(c,shared.err);
4161 }
4162}
4163
4164static void shutdownCommand(redisClient *c) {
4165 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4166 /* Kill the saving child if there is a background saving in progress.
4167 We want to avoid race conditions, for instance our saving child may
4168 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4169 if (server.bgsavechildpid != -1) {
9f3c422c 4170 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4171 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4172 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4173 }
ac945e2d 4174 if (server.appendonly) {
4175 /* Append only file: fsync() the AOF and exit */
4176 fsync(server.appendfd);
054e426d 4177 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4178 exit(0);
ed9b544e 4179 } else {
ac945e2d 4180 /* Snapshotting. Perform a SYNC SAVE and exit */
4181 if (rdbSave(server.dbfilename) == REDIS_OK) {
4182 if (server.daemonize)
4183 unlink(server.pidfile);
4184 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4185 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4186 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4187 exit(0);
4188 } else {
dd88747b 4189 /* Ooops.. error saving! The best we can do is to continue
4190 * operating. Note that if there was a background saving process,
4191 * in the next cron() Redis will be notified that the background
4192 * saving aborted, handling special stuff like slaves pending for
4193 * synchronization... */
ac945e2d 4194 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4195 addReplySds(c,
4196 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4197 }
ed9b544e 4198 }
4199}
4200
4201static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4202 robj *o;
4203
4204 /* To use the same key as src and dst is probably an error */
4205 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4206 addReply(c,shared.sameobjecterr);
ed9b544e 4207 return;
4208 }
4209
dd88747b 4210 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4211 return;
dd88747b 4212
ed9b544e 4213 incrRefCount(o);
3305306f 4214 deleteIfVolatile(c->db,c->argv[2]);
4215 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4216 if (nx) {
4217 decrRefCount(o);
c937aa89 4218 addReply(c,shared.czero);
ed9b544e 4219 return;
4220 }
3305306f 4221 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4222 } else {
4223 incrRefCount(c->argv[2]);
4224 }
3305306f 4225 deleteKey(c->db,c->argv[1]);
ed9b544e 4226 server.dirty++;
c937aa89 4227 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4228}
4229
4230static void renameCommand(redisClient *c) {
4231 renameGenericCommand(c,0);
4232}
4233
4234static void renamenxCommand(redisClient *c) {
4235 renameGenericCommand(c,1);
4236}
4237
4238static void moveCommand(redisClient *c) {
3305306f 4239 robj *o;
4240 redisDb *src, *dst;
ed9b544e 4241 int srcid;
4242
4243 /* Obtain source and target DB pointers */
3305306f 4244 src = c->db;
4245 srcid = c->db->id;
ed9b544e 4246 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4247 addReply(c,shared.outofrangeerr);
ed9b544e 4248 return;
4249 }
3305306f 4250 dst = c->db;
4251 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4252
4253 /* If the user is moving using as target the same
4254 * DB as the source DB it is probably an error. */
4255 if (src == dst) {
c937aa89 4256 addReply(c,shared.sameobjecterr);
ed9b544e 4257 return;
4258 }
4259
4260 /* Check if the element exists and get a reference */
3305306f 4261 o = lookupKeyWrite(c->db,c->argv[1]);
4262 if (!o) {
c937aa89 4263 addReply(c,shared.czero);
ed9b544e 4264 return;
4265 }
4266
4267 /* Try to add the element to the target DB */
3305306f 4268 deleteIfVolatile(dst,c->argv[1]);
4269 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4270 addReply(c,shared.czero);
ed9b544e 4271 return;
4272 }
3305306f 4273 incrRefCount(c->argv[1]);
ed9b544e 4274 incrRefCount(o);
4275
4276 /* OK! key moved, free the entry in the source DB */
3305306f 4277 deleteKey(src,c->argv[1]);
ed9b544e 4278 server.dirty++;
c937aa89 4279 addReply(c,shared.cone);
ed9b544e 4280}
4281
4282/* =================================== Lists ================================ */
4283static void pushGenericCommand(redisClient *c, int where) {
4284 robj *lobj;
ed9b544e 4285 list *list;
3305306f 4286
4287 lobj = lookupKeyWrite(c->db,c->argv[1]);
4288 if (lobj == NULL) {
95242ab5 4289 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4290 addReply(c,shared.cone);
95242ab5 4291 return;
4292 }
ed9b544e 4293 lobj = createListObject();
4294 list = lobj->ptr;
4295 if (where == REDIS_HEAD) {
6b47e12e 4296 listAddNodeHead(list,c->argv[2]);
ed9b544e 4297 } else {
6b47e12e 4298 listAddNodeTail(list,c->argv[2]);
ed9b544e 4299 }
3305306f 4300 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4301 incrRefCount(c->argv[1]);
4302 incrRefCount(c->argv[2]);
4303 } else {
ed9b544e 4304 if (lobj->type != REDIS_LIST) {
4305 addReply(c,shared.wrongtypeerr);
4306 return;
4307 }
95242ab5 4308 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4309 addReply(c,shared.cone);
95242ab5 4310 return;
4311 }
ed9b544e 4312 list = lobj->ptr;
4313 if (where == REDIS_HEAD) {
6b47e12e 4314 listAddNodeHead(list,c->argv[2]);
ed9b544e 4315 } else {
6b47e12e 4316 listAddNodeTail(list,c->argv[2]);
ed9b544e 4317 }
4318 incrRefCount(c->argv[2]);
4319 }
4320 server.dirty++;
520b5a33 4321 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4322}
4323
4324static void lpushCommand(redisClient *c) {
4325 pushGenericCommand(c,REDIS_HEAD);
4326}
4327
4328static void rpushCommand(redisClient *c) {
4329 pushGenericCommand(c,REDIS_TAIL);
4330}
4331
4332static void llenCommand(redisClient *c) {
3305306f 4333 robj *o;
ed9b544e 4334 list *l;
dd88747b 4335
4336 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4337 checkType(c,o,REDIS_LIST)) return;
ed9b544e 4338
dd88747b 4339 l = o->ptr;
4340 addReplyUlong(c,listLength(l));
ed9b544e 4341}
4342
4343static void lindexCommand(redisClient *c) {
3305306f 4344 robj *o;
ed9b544e 4345 int index = atoi(c->argv[2]->ptr);
dd88747b 4346 list *list;
4347 listNode *ln;
4348
4349 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4350 checkType(c,o,REDIS_LIST)) return;
4351 list = o->ptr;
4352
4353 ln = listIndex(list, index);
4354 if (ln == NULL) {
c937aa89 4355 addReply(c,shared.nullbulk);
ed9b544e 4356 } else {
dd88747b 4357 robj *ele = listNodeValue(ln);
4358 addReplyBulk(c,ele);
ed9b544e 4359 }
4360}
4361
4362static void lsetCommand(redisClient *c) {
3305306f 4363 robj *o;
ed9b544e 4364 int index = atoi(c->argv[2]->ptr);
dd88747b 4365 list *list;
4366 listNode *ln;
4367
4368 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4369 checkType(c,o,REDIS_LIST)) return;
4370 list = o->ptr;
4371
4372 ln = listIndex(list, index);
4373 if (ln == NULL) {
4374 addReply(c,shared.outofrangeerr);
ed9b544e 4375 } else {
dd88747b 4376 robj *ele = listNodeValue(ln);
ed9b544e 4377
dd88747b 4378 decrRefCount(ele);
4379 listNodeValue(ln) = c->argv[3];
4380 incrRefCount(c->argv[3]);
4381 addReply(c,shared.ok);
4382 server.dirty++;
ed9b544e 4383 }
4384}
4385
4386static void popGenericCommand(redisClient *c, int where) {
3305306f 4387 robj *o;
dd88747b 4388 list *list;
4389 listNode *ln;
3305306f 4390
dd88747b 4391 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4392 checkType(c,o,REDIS_LIST)) return;
4393 list = o->ptr;
ed9b544e 4394
dd88747b 4395 if (where == REDIS_HEAD)
4396 ln = listFirst(list);
4397 else
4398 ln = listLast(list);
ed9b544e 4399
dd88747b 4400 if (ln == NULL) {
4401 addReply(c,shared.nullbulk);
4402 } else {
4403 robj *ele = listNodeValue(ln);
4404 addReplyBulk(c,ele);
4405 listDelNode(list,ln);
3ea27d37 4406 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4407 server.dirty++;
ed9b544e 4408 }
4409}
4410
4411static void lpopCommand(redisClient *c) {
4412 popGenericCommand(c,REDIS_HEAD);
4413}
4414
4415static void rpopCommand(redisClient *c) {
4416 popGenericCommand(c,REDIS_TAIL);
4417}
4418
4419static void lrangeCommand(redisClient *c) {
3305306f 4420 robj *o;
ed9b544e 4421 int start = atoi(c->argv[2]->ptr);
4422 int end = atoi(c->argv[3]->ptr);
dd88747b 4423 int llen;
4424 int rangelen, j;
4425 list *list;
4426 listNode *ln;
4427 robj *ele;
4428
4429 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4430 checkType(c,o,REDIS_LIST)) return;
4431 list = o->ptr;
4432 llen = listLength(list);
4433
4434 /* convert negative indexes */
4435 if (start < 0) start = llen+start;
4436 if (end < 0) end = llen+end;
4437 if (start < 0) start = 0;
4438 if (end < 0) end = 0;
4439
4440 /* indexes sanity checks */
4441 if (start > end || start >= llen) {
4442 /* Out of range start or start > end result in empty list */
4443 addReply(c,shared.emptymultibulk);
4444 return;
4445 }
4446 if (end >= llen) end = llen-1;
4447 rangelen = (end-start)+1;
3305306f 4448
dd88747b 4449 /* Return the result in form of a multi-bulk reply */
4450 ln = listIndex(list, start);
4451 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4452 for (j = 0; j < rangelen; j++) {
4453 ele = listNodeValue(ln);
4454 addReplyBulk(c,ele);
4455 ln = ln->next;
ed9b544e 4456 }
4457}
4458
4459static void ltrimCommand(redisClient *c) {
3305306f 4460 robj *o;
ed9b544e 4461 int start = atoi(c->argv[2]->ptr);
4462 int end = atoi(c->argv[3]->ptr);
dd88747b 4463 int llen;
4464 int j, ltrim, rtrim;
4465 list *list;
4466 listNode *ln;
4467
4468 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4469 checkType(c,o,REDIS_LIST)) return;
4470 list = o->ptr;
4471 llen = listLength(list);
4472
4473 /* convert negative indexes */
4474 if (start < 0) start = llen+start;
4475 if (end < 0) end = llen+end;
4476 if (start < 0) start = 0;
4477 if (end < 0) end = 0;
4478
4479 /* indexes sanity checks */
4480 if (start > end || start >= llen) {
4481 /* Out of range start or start > end result in empty list */
4482 ltrim = llen;
4483 rtrim = 0;
ed9b544e 4484 } else {
dd88747b 4485 if (end >= llen) end = llen-1;
4486 ltrim = start;
4487 rtrim = llen-end-1;
4488 }
ed9b544e 4489
dd88747b 4490 /* Remove list elements to perform the trim */
4491 for (j = 0; j < ltrim; j++) {
4492 ln = listFirst(list);
4493 listDelNode(list,ln);
4494 }
4495 for (j = 0; j < rtrim; j++) {
4496 ln = listLast(list);
4497 listDelNode(list,ln);
ed9b544e 4498 }
3ea27d37 4499 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4500 server.dirty++;
4501 addReply(c,shared.ok);
ed9b544e 4502}
4503
4504static void lremCommand(redisClient *c) {
3305306f 4505 robj *o;
dd88747b 4506 list *list;
4507 listNode *ln, *next;
4508 int toremove = atoi(c->argv[2]->ptr);
4509 int removed = 0;
4510 int fromtail = 0;
a4d1ba9a 4511
dd88747b 4512 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4513 checkType(c,o,REDIS_LIST)) return;
4514 list = o->ptr;
4515
4516 if (toremove < 0) {
4517 toremove = -toremove;
4518 fromtail = 1;
4519 }
4520 ln = fromtail ? list->tail : list->head;
4521 while (ln) {
4522 robj *ele = listNodeValue(ln);
4523
4524 next = fromtail ? ln->prev : ln->next;
4525 if (compareStringObjects(ele,c->argv[3]) == 0) {
4526 listDelNode(list,ln);
4527 server.dirty++;
4528 removed++;
4529 if (toremove && removed == toremove) break;
ed9b544e 4530 }
dd88747b 4531 ln = next;
ed9b544e 4532 }
3ea27d37 4533 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4534 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4535}
4536
12f9d551 4537/* This is the semantic of this command:
0f5f7e9a 4538 * RPOPLPUSH srclist dstlist:
12f9d551 4539 * IF LLEN(srclist) > 0
4540 * element = RPOP srclist
4541 * LPUSH dstlist element
4542 * RETURN element
4543 * ELSE
4544 * RETURN nil
4545 * END
4546 * END
4547 *
4548 * The idea is to be able to get an element from a list in a reliable way
4549 * since the element is not just returned but pushed against another list
4550 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4551 */
0f5f7e9a 4552static void rpoplpushcommand(redisClient *c) {
12f9d551 4553 robj *sobj;
dd88747b 4554 list *srclist;
4555 listNode *ln;
4556
4557 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4558 checkType(c,sobj,REDIS_LIST)) return;
4559 srclist = sobj->ptr;
4560 ln = listLast(srclist);
12f9d551 4561
dd88747b 4562 if (ln == NULL) {
12f9d551 4563 addReply(c,shared.nullbulk);
4564 } else {
dd88747b 4565 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4566 robj *ele = listNodeValue(ln);
4567 list *dstlist;
e20fb74f 4568
dd88747b 4569 if (dobj && dobj->type != REDIS_LIST) {
4570 addReply(c,shared.wrongtypeerr);
4571 return;
4572 }
12f9d551 4573
dd88747b 4574 /* Add the element to the target list (unless it's directly
4575 * passed to some BLPOP-ing client */
4576 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4577 if (dobj == NULL) {
4578 /* Create the list if the key does not exist */
4579 dobj = createListObject();
4580 dictAdd(c->db->dict,c->argv[2],dobj);
4581 incrRefCount(c->argv[2]);
12f9d551 4582 }
dd88747b 4583 dstlist = dobj->ptr;
4584 listAddNodeHead(dstlist,ele);
4585 incrRefCount(ele);
12f9d551 4586 }
dd88747b 4587
4588 /* Send the element to the client as reply as well */
4589 addReplyBulk(c,ele);
4590
4591 /* Finally remove the element from the source list */
4592 listDelNode(srclist,ln);
3ea27d37 4593 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4594 server.dirty++;
12f9d551 4595 }
4596}
4597
ed9b544e 4598/* ==================================== Sets ================================ */
4599
4600static void saddCommand(redisClient *c) {
ed9b544e 4601 robj *set;
4602
3305306f 4603 set = lookupKeyWrite(c->db,c->argv[1]);
4604 if (set == NULL) {
ed9b544e 4605 set = createSetObject();
3305306f 4606 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4607 incrRefCount(c->argv[1]);
4608 } else {
ed9b544e 4609 if (set->type != REDIS_SET) {
c937aa89 4610 addReply(c,shared.wrongtypeerr);
ed9b544e 4611 return;
4612 }
4613 }
4614 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4615 incrRefCount(c->argv[2]);
4616 server.dirty++;
c937aa89 4617 addReply(c,shared.cone);
ed9b544e 4618 } else {
c937aa89 4619 addReply(c,shared.czero);
ed9b544e 4620 }
4621}
4622
4623static void sremCommand(redisClient *c) {
3305306f 4624 robj *set;
ed9b544e 4625
dd88747b 4626 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4627 checkType(c,set,REDIS_SET)) return;
4628
4629 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4630 server.dirty++;
4631 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4632 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4633 addReply(c,shared.cone);
ed9b544e 4634 } else {
dd88747b 4635 addReply(c,shared.czero);
ed9b544e 4636 }
4637}
4638
a4460ef4 4639static void smoveCommand(redisClient *c) {
4640 robj *srcset, *dstset;
4641
4642 srcset = lookupKeyWrite(c->db,c->argv[1]);
4643 dstset = lookupKeyWrite(c->db,c->argv[2]);
4644
4645 /* If the source key does not exist return 0, if it's of the wrong type
4646 * raise an error */
4647 if (srcset == NULL || srcset->type != REDIS_SET) {
4648 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4649 return;
4650 }
4651 /* Error if the destination key is not a set as well */
4652 if (dstset && dstset->type != REDIS_SET) {
4653 addReply(c,shared.wrongtypeerr);
4654 return;
4655 }
4656 /* Remove the element from the source set */
4657 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4658 /* Key not found in the src set! return zero */
4659 addReply(c,shared.czero);
4660 return;
4661 }
3ea27d37 4662 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4663 deleteKey(c->db,c->argv[1]);
a4460ef4 4664 server.dirty++;
4665 /* Add the element to the destination set */
4666 if (!dstset) {
4667 dstset = createSetObject();
4668 dictAdd(c->db->dict,c->argv[2],dstset);
4669 incrRefCount(c->argv[2]);
4670 }
4671 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4672 incrRefCount(c->argv[3]);
4673 addReply(c,shared.cone);
4674}
4675
ed9b544e 4676static void sismemberCommand(redisClient *c) {
3305306f 4677 robj *set;
ed9b544e 4678
dd88747b 4679 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4680 checkType(c,set,REDIS_SET)) return;
4681
4682 if (dictFind(set->ptr,c->argv[2]))
4683 addReply(c,shared.cone);
4684 else
c937aa89 4685 addReply(c,shared.czero);
ed9b544e 4686}
4687
4688static void scardCommand(redisClient *c) {
3305306f 4689 robj *o;
ed9b544e 4690 dict *s;
dd88747b 4691
4692 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4693 checkType(c,o,REDIS_SET)) return;
ed9b544e 4694
dd88747b 4695 s = o->ptr;
4696 addReplyUlong(c,dictSize(s));
ed9b544e 4697}
4698
12fea928 4699static void spopCommand(redisClient *c) {
4700 robj *set;
4701 dictEntry *de;
4702
dd88747b 4703 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4704 checkType(c,set,REDIS_SET)) return;
4705
4706 de = dictGetRandomKey(set->ptr);
4707 if (de == NULL) {
12fea928 4708 addReply(c,shared.nullbulk);
4709 } else {
dd88747b 4710 robj *ele = dictGetEntryKey(de);
12fea928 4711
dd88747b 4712 addReplyBulk(c,ele);
4713 dictDelete(set->ptr,ele);
4714 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4715 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4716 server.dirty++;
12fea928 4717 }
4718}
4719
2abb95a9 4720static void srandmemberCommand(redisClient *c) {
4721 robj *set;
4722 dictEntry *de;
4723
dd88747b 4724 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4725 checkType(c,set,REDIS_SET)) return;
4726
4727 de = dictGetRandomKey(set->ptr);
4728 if (de == NULL) {
2abb95a9 4729 addReply(c,shared.nullbulk);
4730 } else {
dd88747b 4731 robj *ele = dictGetEntryKey(de);
2abb95a9 4732
dd88747b 4733 addReplyBulk(c,ele);
2abb95a9 4734 }
4735}
4736
ed9b544e 4737static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4738 dict **d1 = (void*) s1, **d2 = (void*) s2;
4739
3305306f 4740 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4741}
4742
682ac724 4743static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4744 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4745 dictIterator *di;
4746 dictEntry *de;
4747 robj *lenobj = NULL, *dstset = NULL;
682ac724 4748 unsigned long j, cardinality = 0;
ed9b544e 4749
ed9b544e 4750 for (j = 0; j < setsnum; j++) {
4751 robj *setobj;
3305306f 4752
4753 setobj = dstkey ?
4754 lookupKeyWrite(c->db,setskeys[j]) :
4755 lookupKeyRead(c->db,setskeys[j]);
4756 if (!setobj) {
ed9b544e 4757 zfree(dv);
5faa6025 4758 if (dstkey) {
fdcaae84 4759 if (deleteKey(c->db,dstkey))
4760 server.dirty++;
0d36ded0 4761 addReply(c,shared.czero);
5faa6025 4762 } else {
4763 addReply(c,shared.nullmultibulk);
4764 }
ed9b544e 4765 return;
4766 }
ed9b544e 4767 if (setobj->type != REDIS_SET) {
4768 zfree(dv);
c937aa89 4769 addReply(c,shared.wrongtypeerr);
ed9b544e 4770 return;
4771 }
4772 dv[j] = setobj->ptr;
4773 }
4774 /* Sort sets from the smallest to largest, this will improve our
4775 * algorithm's performace */
4776 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4777
4778 /* The first thing we should output is the total number of elements...
4779 * since this is a multi-bulk write, but at this stage we don't know
4780 * the intersection set size, so we use a trick, append an empty object
4781 * to the output list and save the pointer to later modify it with the
4782 * right length */
4783 if (!dstkey) {
4784 lenobj = createObject(REDIS_STRING,NULL);
4785 addReply(c,lenobj);
4786 decrRefCount(lenobj);
4787 } else {
4788 /* If we have a target key where to store the resulting set
4789 * create this key with an empty set inside */
4790 dstset = createSetObject();
ed9b544e 4791 }
4792
4793 /* Iterate all the elements of the first (smallest) set, and test
4794 * the element against all the other sets, if at least one set does
4795 * not include the element it is discarded */
4796 di = dictGetIterator(dv[0]);
ed9b544e 4797
4798 while((de = dictNext(di)) != NULL) {
4799 robj *ele;
4800
4801 for (j = 1; j < setsnum; j++)
4802 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4803 if (j != setsnum)
4804 continue; /* at least one set does not contain the member */
4805 ele = dictGetEntryKey(de);
4806 if (!dstkey) {
dd88747b 4807 addReplyBulk(c,ele);
ed9b544e 4808 cardinality++;
4809 } else {
4810 dictAdd(dstset->ptr,ele,NULL);
4811 incrRefCount(ele);
4812 }
4813 }
4814 dictReleaseIterator(di);
4815
83cdfe18 4816 if (dstkey) {
3ea27d37 4817 /* Store the resulting set into the target, if the intersection
4818 * is not an empty set. */
83cdfe18 4819 deleteKey(c->db,dstkey);
3ea27d37 4820 if (dictSize((dict*)dstset->ptr) > 0) {
4821 dictAdd(c->db->dict,dstkey,dstset);
4822 incrRefCount(dstkey);
d36c4e97 4823 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4824 } else {
4825 decrRefCount(dstset);
d36c4e97 4826 addReply(c,shared.czero);
3ea27d37 4827 }
40d224a9 4828 server.dirty++;
d36c4e97 4829 } else {
4830 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4831 }
ed9b544e 4832 zfree(dv);
4833}
4834
4835static void sinterCommand(redisClient *c) {
4836 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4837}
4838
4839static void sinterstoreCommand(redisClient *c) {
4840 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4841}
4842
f4f56e1d 4843#define REDIS_OP_UNION 0
4844#define REDIS_OP_DIFF 1
2830ca53 4845#define REDIS_OP_INTER 2
f4f56e1d 4846
4847static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4848 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4849 dictIterator *di;
4850 dictEntry *de;
f4f56e1d 4851 robj *dstset = NULL;
40d224a9 4852 int j, cardinality = 0;
4853
40d224a9 4854 for (j = 0; j < setsnum; j++) {
4855 robj *setobj;
4856
4857 setobj = dstkey ?
4858 lookupKeyWrite(c->db,setskeys[j]) :
4859 lookupKeyRead(c->db,setskeys[j]);
4860 if (!setobj) {
4861 dv[j] = NULL;
4862 continue;
4863 }
4864 if (setobj->type != REDIS_SET) {
4865 zfree(dv);
4866 addReply(c,shared.wrongtypeerr);
4867 return;
4868 }
4869 dv[j] = setobj->ptr;
4870 }
4871
4872 /* We need a temp set object to store our union. If the dstkey
4873 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4874 * this set object will be the resulting object to set into the target key*/
4875 dstset = createSetObject();
4876
40d224a9 4877 /* Iterate all the elements of all the sets, add every element a single
4878 * time to the result set */
4879 for (j = 0; j < setsnum; j++) {
51829ed3 4880 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4881 if (!dv[j]) continue; /* non existing keys are like empty sets */
4882
4883 di = dictGetIterator(dv[j]);
40d224a9 4884
4885 while((de = dictNext(di)) != NULL) {
4886 robj *ele;
4887
4888 /* dictAdd will not add the same element multiple times */
4889 ele = dictGetEntryKey(de);
f4f56e1d 4890 if (op == REDIS_OP_UNION || j == 0) {
4891 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4892 incrRefCount(ele);
40d224a9 4893 cardinality++;
4894 }
f4f56e1d 4895 } else if (op == REDIS_OP_DIFF) {
4896 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4897 cardinality--;
4898 }
40d224a9 4899 }
4900 }
4901 dictReleaseIterator(di);
51829ed3 4902
d36c4e97 4903 /* result set is empty? Exit asap. */
4904 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 4905 }
4906
f4f56e1d 4907 /* Output the content of the resulting set, if not in STORE mode */
4908 if (!dstkey) {
4909 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4910 di = dictGetIterator(dstset->ptr);
f4f56e1d 4911 while((de = dictNext(di)) != NULL) {
4912 robj *ele;
4913
4914 ele = dictGetEntryKey(de);
dd88747b 4915 addReplyBulk(c,ele);
f4f56e1d 4916 }
4917 dictReleaseIterator(di);
d36c4e97 4918 decrRefCount(dstset);
83cdfe18
AG
4919 } else {
4920 /* If we have a target key where to store the resulting set
4921 * create this key with the result set inside */
4922 deleteKey(c->db,dstkey);
3ea27d37 4923 if (dictSize((dict*)dstset->ptr) > 0) {
4924 dictAdd(c->db->dict,dstkey,dstset);
4925 incrRefCount(dstkey);
d36c4e97 4926 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4927 } else {
4928 decrRefCount(dstset);
d36c4e97 4929 addReply(c,shared.czero);
3ea27d37 4930 }
40d224a9 4931 server.dirty++;
4932 }
4933 zfree(dv);
4934}
4935
4936static void sunionCommand(redisClient *c) {
f4f56e1d 4937 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4938}
4939
4940static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4941 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4942}
4943
4944static void sdiffCommand(redisClient *c) {
4945 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4946}
4947
4948static void sdiffstoreCommand(redisClient *c) {
4949 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4950}
4951
6b47e12e 4952/* ==================================== ZSets =============================== */
4953
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4961
4962/* This skiplist implementation is almost a C translation of the original
4963 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4964 * Alternative to Balanced Trees", modified in three ways:
4965 * a) this implementation allows for repeated values.
4966 * b) the comparison is not just by key (our 'score') but by satellite data.
4967 * c) there is a back pointer, so it's a doubly linked list with the back
4968 * pointers being only at "level 1". This allows to traverse the list
4969 * from tail to head, useful for ZREVRANGE. */
4970
4971static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4972 zskiplistNode *zn = zmalloc(sizeof(*zn));
4973
4974 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
4975 if (level > 0)
4976 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 4977 zn->score = score;
4978 zn->obj = obj;
4979 return zn;
4980}
4981
4982static zskiplist *zslCreate(void) {
4983 int j;
4984 zskiplist *zsl;
4985
4986 zsl = zmalloc(sizeof(*zsl));
4987 zsl->level = 1;
cc812361 4988 zsl->length = 0;
6b47e12e 4989 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 4990 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 4991 zsl->header->forward[j] = NULL;
94e543b5 4992
4993 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4994 if (j < ZSKIPLIST_MAXLEVEL-1)
4995 zsl->header->span[j] = 0;
69d95c3e 4996 }
e3870fab 4997 zsl->header->backward = NULL;
4998 zsl->tail = NULL;
6b47e12e 4999 return zsl;
5000}
5001
fd8ccf44 5002static void zslFreeNode(zskiplistNode *node) {
5003 decrRefCount(node->obj);
ad807e6f 5004 zfree(node->forward);
69d95c3e 5005 zfree(node->span);
fd8ccf44 5006 zfree(node);
5007}
5008
5009static void zslFree(zskiplist *zsl) {
ad807e6f 5010 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5011
ad807e6f 5012 zfree(zsl->header->forward);
69d95c3e 5013 zfree(zsl->header->span);
ad807e6f 5014 zfree(zsl->header);
fd8ccf44 5015 while(node) {
599379dd 5016 next = node->forward[0];
fd8ccf44 5017 zslFreeNode(node);
5018 node = next;
5019 }
ad807e6f 5020 zfree(zsl);
fd8ccf44 5021}
5022
6b47e12e 5023static int zslRandomLevel(void) {
5024 int level = 1;
5025 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5026 level += 1;
5027 return level;
5028}
5029
5030static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5031 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5032 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5033 int i, level;
5034
5035 x = zsl->header;
5036 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5037 /* store rank that is crossed to reach the insert position */
5038 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5039
9d60e6e4 5040 while (x->forward[i] &&
5041 (x->forward[i]->score < score ||
5042 (x->forward[i]->score == score &&
69d95c3e 5043 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5044 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5045 x = x->forward[i];
69d95c3e 5046 }
6b47e12e 5047 update[i] = x;
5048 }
6b47e12e 5049 /* we assume the key is not already inside, since we allow duplicated
5050 * scores, and the re-insertion of score and redis object should never
5051 * happpen since the caller of zslInsert() should test in the hash table
5052 * if the element is already inside or not. */
5053 level = zslRandomLevel();
5054 if (level > zsl->level) {
69d95c3e 5055 for (i = zsl->level; i < level; i++) {
2b37892e 5056 rank[i] = 0;
6b47e12e 5057 update[i] = zsl->header;
2b37892e 5058 update[i]->span[i-1] = zsl->length;
69d95c3e 5059 }
6b47e12e 5060 zsl->level = level;
5061 }
5062 x = zslCreateNode(level,score,obj);
5063 for (i = 0; i < level; i++) {
5064 x->forward[i] = update[i]->forward[i];
5065 update[i]->forward[i] = x;
69d95c3e
PN
5066
5067 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5068 if (i > 0) {
5069 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5070 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5071 }
6b47e12e 5072 }
69d95c3e
PN
5073
5074 /* increment span for untouched levels */
5075 for (i = level; i < zsl->level; i++) {
2b37892e 5076 update[i]->span[i-1]++;
69d95c3e
PN
5077 }
5078
bb975144 5079 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5080 if (x->forward[0])
5081 x->forward[0]->backward = x;
5082 else
5083 zsl->tail = x;
cc812361 5084 zsl->length++;
6b47e12e 5085}
5086
84105336
PN
5087/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5088void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5089 int i;
5090 for (i = 0; i < zsl->level; i++) {
5091 if (update[i]->forward[i] == x) {
5092 if (i > 0) {
5093 update[i]->span[i-1] += x->span[i-1] - 1;
5094 }
5095 update[i]->forward[i] = x->forward[i];
5096 } else {
5097 /* invariant: i > 0, because update[0]->forward[0]
5098 * is always equal to x */
5099 update[i]->span[i-1] -= 1;
5100 }
5101 }
5102 if (x->forward[0]) {
5103 x->forward[0]->backward = x->backward;
5104 } else {
5105 zsl->tail = x->backward;
5106 }
5107 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5108 zsl->level--;
5109 zsl->length--;
5110}
5111
50c55df5 5112/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5113static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5114 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5115 int i;
5116
5117 x = zsl->header;
5118 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5119 while (x->forward[i] &&
5120 (x->forward[i]->score < score ||
5121 (x->forward[i]->score == score &&
5122 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5123 x = x->forward[i];
5124 update[i] = x;
5125 }
5126 /* We may have multiple elements with the same score, what we need
5127 * is to find the element with both the right score and object. */
5128 x = x->forward[0];
50c55df5 5129 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5130 zslDeleteNode(zsl, x, update);
9d60e6e4 5131 zslFreeNode(x);
9d60e6e4 5132 return 1;
5133 } else {
5134 return 0; /* not found */
e197b441 5135 }
5136 return 0; /* not found */
fd8ccf44 5137}
5138
1807985b 5139/* Delete all the elements with score between min and max from the skiplist.
5140 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5141 * Note that this function takes the reference to the hash table view of the
5142 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5143static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5144 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5145 unsigned long removed = 0;
5146 int i;
5147
5148 x = zsl->header;
5149 for (i = zsl->level-1; i >= 0; i--) {
5150 while (x->forward[i] && x->forward[i]->score < min)
5151 x = x->forward[i];
5152 update[i] = x;
5153 }
5154 /* We may have multiple elements with the same score, what we need
5155 * is to find the element with both the right score and object. */
5156 x = x->forward[0];
5157 while (x && x->score <= max) {
84105336
PN
5158 zskiplistNode *next = x->forward[0];
5159 zslDeleteNode(zsl, x, update);
1807985b 5160 dictDelete(dict,x->obj);
5161 zslFreeNode(x);
1807985b 5162 removed++;
5163 x = next;
5164 }
5165 return removed; /* not found */
5166}
1807985b 5167
9212eafd 5168/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5169 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5170static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5171 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5172 unsigned long traversed = 0, removed = 0;
5173 int i;
5174
9212eafd
PN
5175 x = zsl->header;
5176 for (i = zsl->level-1; i >= 0; i--) {
5177 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5178 traversed += i > 0 ? x->span[i-1] : 1;
5179 x = x->forward[i];
1807985b 5180 }
9212eafd
PN
5181 update[i] = x;
5182 }
5183
5184 traversed++;
5185 x = x->forward[0];
5186 while (x && traversed <= end) {
84105336
PN
5187 zskiplistNode *next = x->forward[0];
5188 zslDeleteNode(zsl, x, update);
1807985b 5189 dictDelete(dict,x->obj);
5190 zslFreeNode(x);
1807985b 5191 removed++;
9212eafd 5192 traversed++;
1807985b 5193 x = next;
5194 }
9212eafd 5195 return removed;
1807985b 5196}
5197
50c55df5 5198/* Find the first node having a score equal or greater than the specified one.
5199 * Returns NULL if there is no match. */
5200static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5201 zskiplistNode *x;
5202 int i;
5203
5204 x = zsl->header;
5205 for (i = zsl->level-1; i >= 0; i--) {
5206 while (x->forward[i] && x->forward[i]->score < score)
5207 x = x->forward[i];
5208 }
5209 /* We may have multiple elements with the same score, what we need
5210 * is to find the element with both the right score and object. */
5211 return x->forward[0];
5212}
5213
27b0ccca
PN
5214/* Find the rank for an element by both score and key.
5215 * Returns 0 when the element cannot be found, rank otherwise.
5216 * Note that the rank is 1-based due to the span of zsl->header to the
5217 * first element. */
5218static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5219 zskiplistNode *x;
5220 unsigned long rank = 0;
5221 int i;
5222
5223 x = zsl->header;
5224 for (i = zsl->level-1; i >= 0; i--) {
5225 while (x->forward[i] &&
5226 (x->forward[i]->score < score ||
5227 (x->forward[i]->score == score &&
5228 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5229 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5230 x = x->forward[i];
5231 }
5232
5233 /* x might be equal to zsl->header, so test if obj is non-NULL */
5234 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5235 return rank;
5236 }
5237 }
5238 return 0;
5239}
5240
e74825c2
PN
5241/* Finds an element by its rank. The rank argument needs to be 1-based. */
5242zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5243 zskiplistNode *x;
5244 unsigned long traversed = 0;
5245 int i;
5246
5247 x = zsl->header;
5248 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5249 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5250 {
a50ea45c 5251 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5252 x = x->forward[i];
5253 }
e74825c2
PN
5254 if (traversed == rank) {
5255 return x;
5256 }
5257 }
5258 return NULL;
5259}
5260
fd8ccf44 5261/* The actual Z-commands implementations */
5262
7db723ad 5263/* This generic command implements both ZADD and ZINCRBY.
e2665397 5264 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5265 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5266static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5267 robj *zsetobj;
5268 zset *zs;
5269 double *score;
5270
e2665397 5271 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5272 if (zsetobj == NULL) {
5273 zsetobj = createZsetObject();
e2665397 5274 dictAdd(c->db->dict,key,zsetobj);
5275 incrRefCount(key);
fd8ccf44 5276 } else {
5277 if (zsetobj->type != REDIS_ZSET) {
5278 addReply(c,shared.wrongtypeerr);
5279 return;
5280 }
5281 }
fd8ccf44 5282 zs = zsetobj->ptr;
e2665397 5283
7db723ad 5284 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5285 * needs to handle the two different conditions. It's all about setting
5286 * '*score', that is, the new score to set, to the right value. */
5287 score = zmalloc(sizeof(double));
5288 if (doincrement) {
5289 dictEntry *de;
5290
5291 /* Read the old score. If the element was not present starts from 0 */
5292 de = dictFind(zs->dict,ele);
5293 if (de) {
5294 double *oldscore = dictGetEntryVal(de);
5295 *score = *oldscore + scoreval;
5296 } else {
5297 *score = scoreval;
5298 }
5299 } else {
5300 *score = scoreval;
5301 }
5302
5303 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5304 * to both ZADD and ZINCRBY... */
e2665397 5305 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5306 /* case 1: New element */
e2665397 5307 incrRefCount(ele); /* added to hash */
5308 zslInsert(zs->zsl,*score,ele);
5309 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5310 server.dirty++;
e2665397 5311 if (doincrement)
e2665397 5312 addReplyDouble(c,*score);
91d71bfc 5313 else
5314 addReply(c,shared.cone);
fd8ccf44 5315 } else {
5316 dictEntry *de;
5317 double *oldscore;
5318
5319 /* case 2: Score update operation */
e2665397 5320 de = dictFind(zs->dict,ele);
dfc5e96c 5321 redisAssert(de != NULL);
fd8ccf44 5322 oldscore = dictGetEntryVal(de);
5323 if (*score != *oldscore) {
5324 int deleted;
5325
e2665397 5326 /* Remove and insert the element in the skip list with new score */
5327 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5328 redisAssert(deleted != 0);
e2665397 5329 zslInsert(zs->zsl,*score,ele);
5330 incrRefCount(ele);
5331 /* Update the score in the hash table */
5332 dictReplace(zs->dict,ele,score);
fd8ccf44 5333 server.dirty++;
2161a965 5334 } else {
5335 zfree(score);
fd8ccf44 5336 }
e2665397 5337 if (doincrement)
5338 addReplyDouble(c,*score);
5339 else
5340 addReply(c,shared.czero);
fd8ccf44 5341 }
5342}
5343
e2665397 5344static void zaddCommand(redisClient *c) {
5345 double scoreval;
5346
5347 scoreval = strtod(c->argv[2]->ptr,NULL);
5348 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5349}
5350
7db723ad 5351static void zincrbyCommand(redisClient *c) {
e2665397 5352 double scoreval;
5353
5354 scoreval = strtod(c->argv[2]->ptr,NULL);
5355 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5356}
5357
1b7106e7 5358static void zremCommand(redisClient *c) {
5359 robj *zsetobj;
5360 zset *zs;
dd88747b 5361 dictEntry *de;
5362 double *oldscore;
5363 int deleted;
1b7106e7 5364
dd88747b 5365 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5366 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5367
dd88747b 5368 zs = zsetobj->ptr;
5369 de = dictFind(zs->dict,c->argv[2]);
5370 if (de == NULL) {
5371 addReply(c,shared.czero);
5372 return;
1b7106e7 5373 }
dd88747b 5374 /* Delete from the skiplist */
5375 oldscore = dictGetEntryVal(de);
5376 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5377 redisAssert(deleted != 0);
5378
5379 /* Delete from the hash table */
5380 dictDelete(zs->dict,c->argv[2]);
5381 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5382 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5383 server.dirty++;
5384 addReply(c,shared.cone);
1b7106e7 5385}
5386
1807985b 5387static void zremrangebyscoreCommand(redisClient *c) {
5388 double min = strtod(c->argv[2]->ptr,NULL);
5389 double max = strtod(c->argv[3]->ptr,NULL);
dd88747b 5390 long deleted;
1807985b 5391 robj *zsetobj;
5392 zset *zs;
5393
dd88747b 5394 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5395 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5396
dd88747b 5397 zs = zsetobj->ptr;
5398 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5399 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5400 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5401 server.dirty += deleted;
5402 addReplyLong(c,deleted);
1807985b 5403}
5404
9212eafd
PN
5405static void zremrangebyrankCommand(redisClient *c) {
5406 int start = atoi(c->argv[2]->ptr);
5407 int end = atoi(c->argv[3]->ptr);
dd88747b 5408 int llen;
5409 long deleted;
9212eafd
PN
5410 robj *zsetobj;
5411 zset *zs;
5412
dd88747b 5413 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5414 checkType(c,zsetobj,REDIS_ZSET)) return;
5415 zs = zsetobj->ptr;
5416 llen = zs->zsl->length;
9212eafd 5417
dd88747b 5418 /* convert negative indexes */
5419 if (start < 0) start = llen+start;
5420 if (end < 0) end = llen+end;
5421 if (start < 0) start = 0;
5422 if (end < 0) end = 0;
9212eafd 5423
dd88747b 5424 /* indexes sanity checks */
5425 if (start > end || start >= llen) {
5426 addReply(c,shared.czero);
5427 return;
9212eafd 5428 }
dd88747b 5429 if (end >= llen) end = llen-1;
5430
5431 /* increment start and end because zsl*Rank functions
5432 * use 1-based rank */
5433 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5434 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5435 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5436 server.dirty += deleted;
5437 addReplyLong(c, deleted);
9212eafd
PN
5438}
5439
8f92e768
PN
5440typedef struct {
5441 dict *dict;
5442 double weight;
5443} zsetopsrc;
5444
5445static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5446 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5447 unsigned long size1, size2;
5448 size1 = d1->dict ? dictSize(d1->dict) : 0;
5449 size2 = d2->dict ? dictSize(d2->dict) : 0;
5450 return size1 - size2;
5451}
5452
d2764cd6
PN
/* Ways the scores of an element present in several inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate' (one of the
 * REDIS_AGGR_* values). Any other value trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5469
2830ca53 5470static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5471 int i, j, zsetnum;
d2764cd6 5472 int aggregate = REDIS_AGGR_SUM;
8f92e768 5473 zsetopsrc *src;
2830ca53
PN
5474 robj *dstobj;
5475 zset *dstzset;
b287c9bb
PN
5476 dictIterator *di;
5477 dictEntry *de;
5478
2830ca53
PN
5479 /* expect zsetnum input keys to be given */
5480 zsetnum = atoi(c->argv[2]->ptr);
5481 if (zsetnum < 1) {
5482 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5483 return;
b287c9bb 5484 }
2830ca53
PN
5485
5486 /* test if the expected number of keys would overflow */
5487 if (3+zsetnum > c->argc) {
b287c9bb
PN
5488 addReply(c,shared.syntaxerr);
5489 return;
5490 }
5491
2830ca53 5492 /* read keys to be used for input */
b9eed483 5493 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5494 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5495 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5496 if (!zsetobj) {
8f92e768 5497 src[i].dict = NULL;
b287c9bb
PN
5498 } else {
5499 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5500 zfree(src);
b287c9bb
PN
5501 addReply(c,shared.wrongtypeerr);
5502 return;
5503 }
8f92e768 5504 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5505 }
2830ca53
PN
5506
5507 /* default all weights to 1 */
8f92e768 5508 src[i].weight = 1.0;
b287c9bb
PN
5509 }
5510
2830ca53
PN
5511 /* parse optional extra arguments */
5512 if (j < c->argc) {
d2764cd6 5513 int remaining = c->argc - j;
b287c9bb 5514
2830ca53 5515 while (remaining) {
d2764cd6 5516 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5517 j++; remaining--;
2830ca53 5518 for (i = 0; i < zsetnum; i++, j++, remaining--) {
8f92e768 5519 src[i].weight = strtod(c->argv[j]->ptr, NULL);
2830ca53 5520 }
d2764cd6
PN
5521 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5522 j++; remaining--;
5523 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5524 aggregate = REDIS_AGGR_SUM;
5525 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5526 aggregate = REDIS_AGGR_MIN;
5527 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5528 aggregate = REDIS_AGGR_MAX;
5529 } else {
5530 zfree(src);
5531 addReply(c,shared.syntaxerr);
5532 return;
5533 }
5534 j++; remaining--;
2830ca53 5535 } else {
8f92e768 5536 zfree(src);
2830ca53
PN
5537 addReply(c,shared.syntaxerr);
5538 return;
5539 }
5540 }
5541 }
b287c9bb 5542
d2764cd6
PN
5543 /* sort sets from the smallest to largest, this will improve our
5544 * algorithm's performance */
5545 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5546
2830ca53
PN
5547 dstobj = createZsetObject();
5548 dstzset = dstobj->ptr;
5549
5550 if (op == REDIS_OP_INTER) {
8f92e768
PN
5551 /* skip going over all entries if the smallest zset is NULL or empty */
5552 if (src[0].dict && dictSize(src[0].dict) > 0) {
5553 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5554 * from small to large, all src[i > 0].dict are non-empty too */
5555 di = dictGetIterator(src[0].dict);
2830ca53 5556 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5557 double *score = zmalloc(sizeof(double)), value;
5558 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5559
d2764cd6
PN
5560 for (j = 1; j < zsetnum; j++) {
5561 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5562 if (other) {
d2764cd6
PN
5563 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5564 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5565 } else {
5566 break;
5567 }
5568 }
b287c9bb 5569
2830ca53 5570 /* skip entry when not present in every source dict */
8f92e768 5571 if (j != zsetnum) {
2830ca53
PN
5572 zfree(score);
5573 } else {
5574 robj *o = dictGetEntryKey(de);
5575 dictAdd(dstzset->dict,o,score);
5576 incrRefCount(o); /* added to dictionary */
5577 zslInsert(dstzset->zsl,*score,o);
5578 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5579 }
5580 }
2830ca53
PN
5581 dictReleaseIterator(di);
5582 }
5583 } else if (op == REDIS_OP_UNION) {
5584 for (i = 0; i < zsetnum; i++) {
8f92e768 5585 if (!src[i].dict) continue;
2830ca53 5586
8f92e768 5587 di = dictGetIterator(src[i].dict);
2830ca53
PN
5588 while((de = dictNext(di)) != NULL) {
5589 /* skip key when already processed */
5590 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5591
d2764cd6
PN
5592 double *score = zmalloc(sizeof(double)), value;
5593 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5594
d2764cd6
PN
5595 /* because the zsets are sorted by size, its only possible
5596 * for sets at larger indices to hold this entry */
5597 for (j = (i+1); j < zsetnum; j++) {
5598 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5599 if (other) {
d2764cd6
PN
5600 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5601 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5602 }
5603 }
b287c9bb 5604
2830ca53
PN
5605 robj *o = dictGetEntryKey(de);
5606 dictAdd(dstzset->dict,o,score);
5607 incrRefCount(o); /* added to dictionary */
5608 zslInsert(dstzset->zsl,*score,o);
5609 incrRefCount(o); /* added to skiplist */
5610 }
5611 dictReleaseIterator(di);
b287c9bb 5612 }
2830ca53
PN
5613 } else {
5614 /* unknown operator */
5615 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5616 }
5617
5618 deleteKey(c->db,dstkey);
3ea27d37 5619 if (dstzset->zsl->length) {
5620 dictAdd(c->db->dict,dstkey,dstobj);
5621 incrRefCount(dstkey);
5622 addReplyLong(c, dstzset->zsl->length);
5623 server.dirty++;
5624 } else {
5625 decrRefCount(dstzset);
5626 addReply(c, shared.czero);
5627 }
8f92e768 5628 zfree(src);
b287c9bb
PN
5629}
5630
2830ca53
PN
5631static void zunionCommand(redisClient *c) {
5632 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5633}
5634
2830ca53
PN
5635static void zinterCommand(redisClient *c) {
5636 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5637}
5638
e3870fab 5639static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5640 robj *o;
5641 int start = atoi(c->argv[2]->ptr);
5642 int end = atoi(c->argv[3]->ptr);
752da584 5643 int withscores = 0;
dd88747b 5644 int llen;
5645 int rangelen, j;
5646 zset *zsetobj;
5647 zskiplist *zsl;
5648 zskiplistNode *ln;
5649 robj *ele;
752da584 5650
5651 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5652 withscores = 1;
5653 } else if (c->argc >= 5) {
5654 addReply(c,shared.syntaxerr);
5655 return;
5656 }
cc812361 5657
dd88747b 5658 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5659 checkType(c,o,REDIS_ZSET)) return;
5660 zsetobj = o->ptr;
5661 zsl = zsetobj->zsl;
5662 llen = zsl->length;
cc812361 5663
dd88747b 5664 /* convert negative indexes */
5665 if (start < 0) start = llen+start;
5666 if (end < 0) end = llen+end;
5667 if (start < 0) start = 0;
5668 if (end < 0) end = 0;
cc812361 5669
dd88747b 5670 /* indexes sanity checks */
5671 if (start > end || start >= llen) {
5672 /* Out of range start or start > end result in empty list */
5673 addReply(c,shared.emptymultibulk);
5674 return;
5675 }
5676 if (end >= llen) end = llen-1;
5677 rangelen = (end-start)+1;
cc812361 5678
dd88747b 5679 /* check if starting point is trivial, before searching
5680 * the element in log(N) time */
5681 if (reverse) {
5682 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5683 } else {
5684 ln = start == 0 ?
5685 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5686 }
cc812361 5687
dd88747b 5688 /* Return the result in form of a multi-bulk reply */
5689 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5690 withscores ? (rangelen*2) : rangelen));
5691 for (j = 0; j < rangelen; j++) {
5692 ele = ln->obj;
5693 addReplyBulk(c,ele);
5694 if (withscores)
5695 addReplyDouble(c,ln->score);
5696 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5697 }
5698}
5699
e3870fab 5700static void zrangeCommand(redisClient *c) {
5701 zrangeGenericCommand(c,0);
5702}
5703
5704static void zrevrangeCommand(redisClient *c) {
5705 zrangeGenericCommand(c,1);
5706}
5707
f44dd428 5708/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5709 * If justcount is non-zero, just the count is returned. */
5710static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5711 robj *o;
f44dd428 5712 double min, max;
5713 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5714 int offset = 0, limit = -1;
0500ef27
SH
5715 int withscores = 0;
5716 int badsyntax = 0;
5717
f44dd428 5718 /* Parse the min-max interval. If one of the values is prefixed
5719 * by the "(" character, it's considered "open". For instance
5720 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5721 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5722 if (((char*)c->argv[2]->ptr)[0] == '(') {
5723 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5724 minex = 1;
5725 } else {
5726 min = strtod(c->argv[2]->ptr,NULL);
5727 }
5728 if (((char*)c->argv[3]->ptr)[0] == '(') {
5729 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5730 maxex = 1;
5731 } else {
5732 max = strtod(c->argv[3]->ptr,NULL);
5733 }
5734
5735 /* Parse "WITHSCORES": note that if the command was called with
5736 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5737 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5738 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5739 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5740 withscores = 1;
5741 else
5742 badsyntax = 1;
0500ef27 5743 }
3a3978b1 5744 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5745 badsyntax = 1;
0500ef27 5746 if (badsyntax) {
454d4e43 5747 addReplySds(c,
5748 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5749 return;
0500ef27
SH
5750 }
5751
f44dd428 5752 /* Parse "LIMIT" */
0500ef27 5753 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5754 addReply(c,shared.syntaxerr);
5755 return;
0500ef27 5756 } else if (c->argc == (7 + withscores)) {
80181f78 5757 offset = atoi(c->argv[5]->ptr);
5758 limit = atoi(c->argv[6]->ptr);
0b13687c 5759 if (offset < 0) offset = 0;
80181f78 5760 }
50c55df5 5761
f44dd428 5762 /* Ok, lookup the key and get the range */
50c55df5 5763 o = lookupKeyRead(c->db,c->argv[1]);
5764 if (o == NULL) {
f44dd428 5765 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
50c55df5 5766 } else {
5767 if (o->type != REDIS_ZSET) {
5768 addReply(c,shared.wrongtypeerr);
5769 } else {
5770 zset *zsetobj = o->ptr;
5771 zskiplist *zsl = zsetobj->zsl;
5772 zskiplistNode *ln;
f44dd428 5773 robj *ele, *lenobj = NULL;
5774 unsigned long rangelen = 0;
50c55df5 5775
f44dd428 5776 /* Get the first node with the score >= min, or with
5777 * score > min if 'minex' is true. */
50c55df5 5778 ln = zslFirstWithScore(zsl,min);
f44dd428 5779 while (minex && ln && ln->score == min) ln = ln->forward[0];
5780
50c55df5 5781 if (ln == NULL) {
5782 /* No element matching the speciifed interval */
f44dd428 5783 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5784 return;
5785 }
5786
5787 /* We don't know in advance how many matching elements there
5788 * are in the list, so we push this object that will represent
5789 * the multi-bulk length in the output buffer, and will "fix"
5790 * it later */
f44dd428 5791 if (!justcount) {
5792 lenobj = createObject(REDIS_STRING,NULL);
5793 addReply(c,lenobj);
5794 decrRefCount(lenobj);
5795 }
50c55df5 5796
f44dd428 5797 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5798 if (offset) {
5799 offset--;
5800 ln = ln->forward[0];
5801 continue;
5802 }
5803 if (limit == 0) break;
f44dd428 5804 if (!justcount) {
5805 ele = ln->obj;
dd88747b 5806 addReplyBulk(c,ele);
f44dd428 5807 if (withscores)
5808 addReplyDouble(c,ln->score);
5809 }
50c55df5 5810 ln = ln->forward[0];
5811 rangelen++;
80181f78 5812 if (limit > 0) limit--;
50c55df5 5813 }
f44dd428 5814 if (justcount) {
5815 addReplyLong(c,(long)rangelen);
5816 } else {
5817 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5818 withscores ? (rangelen*2) : rangelen);
5819 }
50c55df5 5820 }
5821 }
5822}
5823
f44dd428 5824static void zrangebyscoreCommand(redisClient *c) {
5825 genericZrangebyscoreCommand(c,0);
5826}
5827
5828static void zcountCommand(redisClient *c) {
5829 genericZrangebyscoreCommand(c,1);
5830}
5831
3c41331e 5832static void zcardCommand(redisClient *c) {
e197b441 5833 robj *o;
5834 zset *zs;
dd88747b 5835
5836 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5837 checkType(c,o,REDIS_ZSET)) return;
5838
5839 zs = o->ptr;
5840 addReplyUlong(c,zs->zsl->length);
e197b441 5841}
5842
6e333bbe 5843static void zscoreCommand(redisClient *c) {
5844 robj *o;
5845 zset *zs;
dd88747b 5846 dictEntry *de;
5847
5848 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5849 checkType(c,o,REDIS_ZSET)) return;
5850
5851 zs = o->ptr;
5852 de = dictFind(zs->dict,c->argv[2]);
5853 if (!de) {
96d8b4ee 5854 addReply(c,shared.nullbulk);
6e333bbe 5855 } else {
dd88747b 5856 double *score = dictGetEntryVal(de);
6e333bbe 5857
dd88747b 5858 addReplyDouble(c,*score);
6e333bbe 5859 }
5860}
5861
798d9e55 5862static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 5863 robj *o;
dd88747b 5864 zset *zs;
5865 zskiplist *zsl;
5866 dictEntry *de;
5867 unsigned long rank;
5868 double *score;
5869
5870 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5871 checkType(c,o,REDIS_ZSET)) return;
5872
5873 zs = o->ptr;
5874 zsl = zs->zsl;
5875 de = dictFind(zs->dict,c->argv[2]);
5876 if (!de) {
69d95c3e
PN
5877 addReply(c,shared.nullbulk);
5878 return;
5879 }
69d95c3e 5880
dd88747b 5881 score = dictGetEntryVal(de);
5882 rank = zslGetRank(zsl, *score, c->argv[2]);
5883 if (rank) {
5884 if (reverse) {
5885 addReplyLong(c, zsl->length - rank);
27b0ccca 5886 } else {
dd88747b 5887 addReplyLong(c, rank-1);
69d95c3e 5888 }
dd88747b 5889 } else {
5890 addReply(c,shared.nullbulk);
978c2c94 5891 }
5892}
5893
798d9e55
PN
5894static void zrankCommand(redisClient *c) {
5895 zrankGenericCommand(c, 0);
5896}
5897
5898static void zrevrankCommand(redisClient *c) {
5899 zrankGenericCommand(c, 1);
5900}
5901
cbba7dd7 5902/* =================================== Hashes =============================== */
978c2c94 5903static void hsetCommand(redisClient *c) {
5904 int update = 0;
5905 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5906
5907 if (o == NULL) {
5908 o = createHashObject();
5909 dictAdd(c->db->dict,c->argv[1],o);
5910 incrRefCount(c->argv[1]);
5911 } else {
5912 if (o->type != REDIS_HASH) {
5913 addReply(c,shared.wrongtypeerr);
5914 return;
5915 }
5916 }
bae2c7ec 5917 /* We want to convert the zipmap into an hash table right now if the
5918 * entry to be added is too big. Note that we check if the object
5919 * is integer encoded before to try fetching the length in the test below.
5920 * This is because integers are small, but currently stringObjectLen()
5921 * performs a slow conversion: not worth it. */
5922 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5923 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5924 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5925 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5926 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5927 {
5928 convertToRealHash(o);
5929 }
5930
978c2c94 5931 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5932 unsigned char *zm = o->ptr;
b1befe6a 5933 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 5934
5935 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 5936 valobj->ptr,sdslen(valobj->ptr),&update);
5937 decrRefCount(valobj);
cbba7dd7 5938 o->ptr = zm;
bae2c7ec 5939
5940 /* And here there is the second check for hash conversion...
5941 * we want to do it only if the operation was not just an update as
5942 * zipmapLen() is O(N). */
5943 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5944 convertToRealHash(o);
978c2c94 5945 } else {
bae2c7ec 5946 tryObjectEncoding(c->argv[2]);
5947 /* note that c->argv[3] is already encoded, as the latest arg
5948 * of a bulk command is always integer encoded if possible. */
2069d06a 5949 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
978c2c94 5950 incrRefCount(c->argv[2]);
5951 } else {
5952 update = 1;
5953 }
5954 incrRefCount(c->argv[3]);
5955 }
5956 server.dirty++;
5957 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5958}
5959
01426b05
PN
5960static void hincrbyCommand(redisClient *c) {
5961 int update = 0;
5962 long long value = 0, incr = 0;
5963 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5964
5965 if (o == NULL) {
5966 o = createHashObject();
5967 dictAdd(c->db->dict,c->argv[1],o);
5968 incrRefCount(c->argv[1]);
5969 } else {
5970 if (o->type != REDIS_HASH) {
5971 addReply(c,shared.wrongtypeerr);
5972 return;
5973 }
5974 }
5975
5976 robj *o_incr = getDecodedObject(c->argv[3]);
5977 incr = strtoll(o_incr->ptr, NULL, 10);
5978 decrRefCount(o_incr);
5979
5980 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5981 unsigned char *zm = o->ptr;
5982 unsigned char *zval;
5983 unsigned int zvlen;
5984
5985 /* Find value if already present in hash */
5986 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5987 &zval,&zvlen)) {
5988 /* strtoll needs the char* to have a trailing \0, but
5989 * the zipmap doesn't include them. */
5990 sds szval = sdsnewlen(zval, zvlen);
5991 value = strtoll(szval,NULL,10);
5992 sdsfree(szval);
5993 }
5994
5995 value += incr;
5996 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
5997 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5998 (unsigned char*)svalue,sdslen(svalue),&update);
5999 sdsfree(svalue);
6000 o->ptr = zm;
6001
6002 /* Check if the zipmap needs to be converted
6003 * if this was not an update. */
6004 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
6005 convertToRealHash(o);
6006 } else {
6007 robj *hval;
6008 dictEntry *de;
6009
6010 /* Find value if already present in hash */
6011 de = dictFind(o->ptr,c->argv[2]);
6012 if (de != NULL) {
6013 hval = dictGetEntryVal(de);
6014 if (hval->encoding == REDIS_ENCODING_RAW)
6015 value = strtoll(hval->ptr,NULL,10);
6016 else if (hval->encoding == REDIS_ENCODING_INT)
6017 value = (long)hval->ptr;
6018 else
6019 redisAssert(1 != 1);
6020 }
6021
6022 value += incr;
6023 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6024 tryObjectEncoding(hval);
01426b05
PN
6025 if (dictReplace(o->ptr,c->argv[2],hval)) {
6026 incrRefCount(c->argv[2]);
6027 }
6028 }
6029
6030 server.dirty++;
6031 addReplyLong(c, value);
6032}
6033
978c2c94 6034static void hgetCommand(redisClient *c) {
dd88747b 6035 robj *o;
978c2c94 6036
dd88747b 6037 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6038 checkType(c,o,REDIS_HASH)) return;
6039
6040 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6041 unsigned char *zm = o->ptr;
6042 unsigned char *val;
6043 unsigned int vlen;
164ee595 6044 robj *field;
dd88747b 6045
164ee595 6046 field = getDecodedObject(c->argv[2]);
6047 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
dd88747b 6048 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6049 addReplySds(c,sdsnewlen(val,vlen));
6050 addReply(c,shared.crlf);
164ee595 6051 decrRefCount(field);
dd88747b 6052 return;
6053 } else {
6054 addReply(c,shared.nullbulk);
164ee595 6055 decrRefCount(field);
bcd11906 6056 return;
6057 }
dd88747b 6058 } else {
6059 struct dictEntry *de;
bcd11906 6060
dd88747b 6061 de = dictFind(o->ptr,c->argv[2]);
6062 if (de == NULL) {
6063 addReply(c,shared.nullbulk);
978c2c94 6064 } else {
dd88747b 6065 robj *e = dictGetEntryVal(de);
978c2c94 6066
dd88747b 6067 addReplyBulk(c,e);
978c2c94 6068 }
69d95c3e 6069 }
69d95c3e
PN
6070}
6071
07efaf74 6072static void hdelCommand(redisClient *c) {
dd88747b 6073 robj *o;
6074 int deleted = 0;
07efaf74 6075
dd88747b 6076 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6077 checkType(c,o,REDIS_HASH)) return;
07efaf74 6078
dd88747b 6079 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
2a1198b4 6080 robj *field = getDecodedObject(c->argv[2]);
6081
dd88747b 6082 o->ptr = zipmapDel((unsigned char*) o->ptr,
2a1198b4 6083 (unsigned char*) field->ptr,
6084 sdslen(field->ptr), &deleted);
6085 decrRefCount(field);
3ea27d37 6086 if (zipmapLen((unsigned char*) o->ptr) == 0)
6087 deleteKey(c->db,c->argv[1]);
dd88747b 6088 } else {
6089 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
3ea27d37 6090 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6091 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
07efaf74 6092 }
c77169b7 6093 if (deleted) server.dirty++;
dd88747b 6094 addReply(c,deleted ? shared.cone : shared.czero);
07efaf74 6095}
6096
92b27fe9 6097static void hlenCommand(redisClient *c) {
6098 robj *o;
6099 unsigned long len;
6100
dd88747b 6101 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6102 checkType(c,o,REDIS_HASH)) return;
6103
6104 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6105 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6106 addReplyUlong(c,len);
6107}
6108
78409a0f 6109#define REDIS_GETALL_KEYS 1
6110#define REDIS_GETALL_VALS 2
6111static void genericHgetallCommand(redisClient *c, int flags) {
6112 robj *o, *lenobj;
6113 unsigned long count = 0;
6114
6115 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6116 || checkType(c,o,REDIS_HASH)) return;
6117
6118 lenobj = createObject(REDIS_STRING,NULL);
6119 addReply(c,lenobj);
6120 decrRefCount(lenobj);
6121
6122 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6123 unsigned char *p = zipmapRewind(o->ptr);
6124 unsigned char *field, *val;
6125 unsigned int flen, vlen;
6126
6127 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6128 robj *aux;
6129
6130 if (flags & REDIS_GETALL_KEYS) {
6131 aux = createStringObject((char*)field,flen);
6132 addReplyBulk(c,aux);
6133 decrRefCount(aux);
6134 count++;
6135 }
6136 if (flags & REDIS_GETALL_VALS) {
6137 aux = createStringObject((char*)val,vlen);
6138 addReplyBulk(c,aux);
6139 decrRefCount(aux);
6140 count++;
6141 }
6142 }
6143 } else {
6144 dictIterator *di = dictGetIterator(o->ptr);
6145 dictEntry *de;
6146
6147 while((de = dictNext(di)) != NULL) {
6148 robj *fieldobj = dictGetEntryKey(de);
6149 robj *valobj = dictGetEntryVal(de);
6150
6151 if (flags & REDIS_GETALL_KEYS) {
6152 addReplyBulk(c,fieldobj);
6153 count++;
6154 }
6155 if (flags & REDIS_GETALL_VALS) {
6156 addReplyBulk(c,valobj);
6157 count++;
6158 }
6159 }
6160 dictReleaseIterator(di);
6161 }
6162 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6163}
6164
6165static void hkeysCommand(redisClient *c) {
6166 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6167}
6168
6169static void hvalsCommand(redisClient *c) {
6170 genericHgetallCommand(c,REDIS_GETALL_VALS);
6171}
6172
6173static void hgetallCommand(redisClient *c) {
6174 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6175}
6176
a86f14b1 6177static void hexistsCommand(redisClient *c) {
6178 robj *o;
6179 int exists = 0;
6180
6181 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6182 checkType(c,o,REDIS_HASH)) return;
6183
6184 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6185 robj *field;
6186 unsigned char *zm = o->ptr;
6187
6188 field = getDecodedObject(c->argv[2]);
6189 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6190 decrRefCount(field);
6191 } else {
6192 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6193 }
6194 addReply(c,exists ? shared.cone : shared.czero);
6195}
6196
ada386b2 6197static void convertToRealHash(robj *o) {
6198 unsigned char *key, *val, *p, *zm = o->ptr;
6199 unsigned int klen, vlen;
6200 dict *dict = dictCreate(&hashDictType,NULL);
6201
6202 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6203 p = zipmapRewind(zm);
6204 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6205 robj *keyobj, *valobj;
6206
6207 keyobj = createStringObject((char*)key,klen);
6208 valobj = createStringObject((char*)val,vlen);
6209 tryObjectEncoding(keyobj);
6210 tryObjectEncoding(valobj);
6211 dictAdd(dict,keyobj,valobj);
6212 }
6213 o->encoding = REDIS_ENCODING_HT;
6214 o->ptr = dict;
6215 zfree(zm);
6216}
6217
6b47e12e 6218/* ========================= Non type-specific commands ==================== */
6219
ed9b544e 6220static void flushdbCommand(redisClient *c) {
ca37e9cd 6221 server.dirty += dictSize(c->db->dict);
3305306f 6222 dictEmpty(c->db->dict);
6223 dictEmpty(c->db->expires);
ed9b544e 6224 addReply(c,shared.ok);
ed9b544e 6225}
6226
6227static void flushallCommand(redisClient *c) {
ca37e9cd 6228 server.dirty += emptyDb();
ed9b544e 6229 addReply(c,shared.ok);
500ece7c 6230 if (server.bgsavechildpid != -1) {
6231 kill(server.bgsavechildpid,SIGKILL);
6232 rdbRemoveTempFile(server.bgsavechildpid);
6233 }
f78fd11b 6234 rdbSave(server.dbfilename);
ca37e9cd 6235 server.dirty++;
ed9b544e 6236}
6237
56906eef 6238static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6239 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6240 so->type = type;
6241 so->pattern = pattern;
6242 return so;
6243}
6244
6245/* Return the value associated to the key with a name obtained
6246 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6247static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6248 char *p;
6249 sds spat, ssub;
6250 robj keyobj;
6251 int prefixlen, sublen, postfixlen;
ed9b544e 6252 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6253 struct {
f1017b3f 6254 long len;
6255 long free;
ed9b544e 6256 char buf[REDIS_SORTKEY_MAX+1];
6257 } keyname;
6258
28173a49 6259 /* If the pattern is "#" return the substitution object itself in order
6260 * to implement the "SORT ... GET #" feature. */
6261 spat = pattern->ptr;
6262 if (spat[0] == '#' && spat[1] == '\0') {
6263 return subst;
6264 }
6265
6266 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6267 * a decoded object on the fly. Otherwise getDecodedObject will just
6268 * increment the ref count, that we'll decrement later. */
6269 subst = getDecodedObject(subst);
942a3961 6270
ed9b544e 6271 ssub = subst->ptr;
6272 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6273 p = strchr(spat,'*');
ed5a857a 6274 if (!p) {
6275 decrRefCount(subst);
6276 return NULL;
6277 }
ed9b544e 6278
6279 prefixlen = p-spat;
6280 sublen = sdslen(ssub);
6281 postfixlen = sdslen(spat)-(prefixlen+1);
6282 memcpy(keyname.buf,spat,prefixlen);
6283 memcpy(keyname.buf+prefixlen,ssub,sublen);
6284 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6285 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6286 keyname.len = prefixlen+sublen+postfixlen;
6287
dfc5e96c 6288 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6289 decrRefCount(subst);
6290
a4d1ba9a 6291 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6292 return lookupKeyRead(db,&keyobj);
ed9b544e 6293}
6294
6295/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6296 * the additional parameter is not standard but a BSD-specific we have to
6297 * pass sorting parameters via the global 'server' structure */
6298static int sortCompare(const void *s1, const void *s2) {
6299 const redisSortObject *so1 = s1, *so2 = s2;
6300 int cmp;
6301
6302 if (!server.sort_alpha) {
6303 /* Numeric sorting. Here it's trivial as we precomputed scores */
6304 if (so1->u.score > so2->u.score) {
6305 cmp = 1;
6306 } else if (so1->u.score < so2->u.score) {
6307 cmp = -1;
6308 } else {
6309 cmp = 0;
6310 }
6311 } else {
6312 /* Alphanumeric sorting */
6313 if (server.sort_bypattern) {
6314 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6315 /* At least one compare object is NULL */
6316 if (so1->u.cmpobj == so2->u.cmpobj)
6317 cmp = 0;
6318 else if (so1->u.cmpobj == NULL)
6319 cmp = -1;
6320 else
6321 cmp = 1;
6322 } else {
6323 /* We have both the objects, use strcoll */
6324 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6325 }
6326 } else {
6327 /* Compare elements directly */
9d65a1bb 6328 robj *dec1, *dec2;
6329
6330 dec1 = getDecodedObject(so1->obj);
6331 dec2 = getDecodedObject(so2->obj);
6332 cmp = strcoll(dec1->ptr,dec2->ptr);
6333 decrRefCount(dec1);
6334 decrRefCount(dec2);
ed9b544e 6335 }
6336 }
6337 return server.sort_desc ? -cmp : cmp;
6338}
6339
6340/* The SORT command is the most complex command in Redis. Warning: this code
6341 * is optimized for speed and a bit less for readability */
/* Overview of the steps performed below:
 *  1. Look up c->argv[1]; reply shared.nullmultibulk if it is missing and
 *     shared.wrongtypeerr if it is not a list, set or sorted set.
 *  2. Parse the options (matched case-insensitively with strcasecmp):
 *     ASC, DESC, ALPHA, LIMIT <start> <count>, STORE <key>, BY <pattern>,
 *     GET <pattern>. Anything else produces shared.syntaxerr.
 *  3. Load every element of the value into 'vector'.
 *  4. Unless 'dontsort' is set, compute the scores / compare objects and
 *     sort (the sort parameters are passed to sortCompare() through the
 *     global 'server' structure).
 *  5. Either send the [start,end] range to the client as a multi bulk
 *     reply, or (STORE) build a list object, set it at 'storekey' with
 *     dictReplace() and reply with the number of stored elements.
 *  6. Release the temporary resources. */
6342static void sortCommand(redisClient *c) {
ed9b544e 6343 list *operations;
6344 int outputlen = 0;
6345 int desc = 0, alpha = 0;
6346 int limit_start = 0, limit_count = -1, start, end;
6347 int j, dontsort = 0, vectorlen;
6348 int getop = 0; /* GET operation counter */
443c6409 6349 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6350 redisSortObject *vector; /* Resulting vector to sort */
6351
6352 /* Lookup the key to sort. It must be of the right types */
3305306f 6353 sortval = lookupKeyRead(c->db,c->argv[1]);
6354 if (sortval == NULL) {
d922ae65 6355 addReply(c,shared.nullmultibulk);
ed9b544e 6356 return;
6357 }
a5eb649b 6358 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6359 sortval->type != REDIS_ZSET)
6360 {
c937aa89 6361 addReply(c,shared.wrongtypeerr);
ed9b544e 6362 return;
6363 }
6364
6365 /* Create a list of operations to perform for every sorted element.
6366 * Operations can be GET/DEL/INCR/DECR */
6367 operations = listCreate();
092dac2a 6368 listSetFreeMethod(operations,zfree);
ed9b544e 6369 j = 2;
6370
6371 /* Now we need to protect sortval incrementing its count, in the future
6372 * SORT may have options able to overwrite/delete keys during the sorting
6373 * and the sorted key itself may get destroied */
6374 incrRefCount(sortval);
6375
6376 /* The SORT command has an SQL-alike syntax, parse it */
6377 while(j < c->argc) {
    /* Number of arguments following the current one. */
6378 int leftargs = c->argc-j-1;
6379 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6380 desc = 0;
6381 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6382 desc = 1;
6383 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6384 alpha = 1;
6385 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
    /* NOTE(review): atoi() does no error reporting, so non numeric
     * LIMIT arguments are not rejected here. */
6386 limit_start = atoi(c->argv[j+1]->ptr);
6387 limit_count = atoi(c->argv[j+2]->ptr);
6388 j+=2;
443c6409 6389 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6390 storekey = c->argv[j+1];
6391 j++;
ed9b544e 6392 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6393 sortby = c->argv[j+1];
6394 /* If the BY pattern does not contain '*', i.e. it is constant,
6395 * we don't need to sort nor to lookup the weight keys. */
6396 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6397 j++;
6398 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6399 listAddNodeTail(operations,createSortOperation(
6400 REDIS_SORT_GET,c->argv[j+1]));
6401 getop++;
6402 j++;
ed9b544e 6403 } else {
    /* Unknown option or missing arguments: undo what we did so far. */
6404 decrRefCount(sortval);
6405 listRelease(operations);
c937aa89 6406 addReply(c,shared.syntaxerr);
ed9b544e 6407 return;
6408 }
6409 j++;
6410 }
6411
6412 /* Load the sorting vector with all the objects to sort */
a5eb649b 6413 switch(sortval->type) {
6414 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6415 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6416 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6417 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6418 }
ed9b544e 6419 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6420 j = 0;
a5eb649b 6421
ed9b544e 6422 if (sortval->type == REDIS_LIST) {
6423 list *list = sortval->ptr;
6208b3a7 6424 listNode *ln;
c7df85a4 6425 listIter li;
6208b3a7 6426
c7df85a4 6427 listRewind(list,&li);
6428 while((ln = listNext(&li))) {
ed9b544e 6429 robj *ele = ln->value;
6430 vector[j].obj = ele;
6431 vector[j].u.score = 0;
6432 vector[j].u.cmpobj = NULL;
ed9b544e 6433 j++;
6434 }
6435 } else {
a5eb649b 6436 dict *set;
ed9b544e 6437 dictIterator *di;
6438 dictEntry *setele;
6439
    /* For sorted sets the elements are taken from the zset's dict. */
a5eb649b 6440 if (sortval->type == REDIS_SET) {
6441 set = sortval->ptr;
6442 } else {
6443 zset *zs = sortval->ptr;
6444 set = zs->dict;
6445 }
6446
ed9b544e 6447 di = dictGetIterator(set);
ed9b544e 6448 while((setele = dictNext(di)) != NULL) {
6449 vector[j].obj = dictGetEntryKey(setele);
6450 vector[j].u.score = 0;
6451 vector[j].u.cmpobj = NULL;
6452 j++;
6453 }
6454 dictReleaseIterator(di);
6455 }
dfc5e96c 6456 redisAssert(j == vectorlen);
ed9b544e 6457
6458 /* Now it's time to load the right scores in the sorting vector */
6459 if (dontsort == 0) {
6460 for (j = 0; j < vectorlen; j++) {
6461 if (sortby) {
6462 robj *byval;
6463
3305306f 6464 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
    /* Missing or non-string weight: keep the score / cmpobj set
     * when the vector was loaded. */
ed9b544e 6465 if (!byval || byval->type != REDIS_STRING) continue;
6466 if (alpha) {
9d65a1bb 6467 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6468 } else {
942a3961 6469 if (byval->encoding == REDIS_ENCODING_RAW) {
6470 vector[j].u.score = strtod(byval->ptr,NULL);
6471 } else {
9d65a1bb 6472 /* Don't need to decode the object if it's
6473 * integer-encoded (the only encoding supported) so
6474 * far. We can just cast it */
f1017b3f 6475 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6476 vector[j].u.score = (long)byval->ptr;
f1017b3f 6477 } else
dfc5e96c 6478 redisAssert(1 != 1);
942a3961 6479 }
ed9b544e 6480 }
6481 } else {
942a3961 6482 if (!alpha) {
6483 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6484 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6485 else {
6486 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6487 vector[j].u.score = (long) vector[j].obj->ptr;
6488 else
dfc5e96c 6489 redisAssert(1 != 1);
942a3961 6490 }
6491 }
ed9b544e 6492 }
6493 }
6494 }
6495
6496 /* We are ready to sort the vector... perform a bit of sanity check
6497 * on the LIMIT option too. We'll use a partial version of quicksort. */
    /* NOTE(review): start+limit_count-1 is computed with plain 'int'
     * arithmetic; very large LIMIT arguments look like they could overflow
     * here - TODO confirm. */
6498 start = (limit_start < 0) ? 0 : limit_start;
6499 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6500 if (start >= vectorlen) {
    /* Out of range start: make [start,end] an empty range. */
6501 start = vectorlen-1;
6502 end = vectorlen-2;
6503 }
6504 if (end >= vectorlen) end = vectorlen-1;
6505
6506 if (dontsort == 0) {
6507 server.sort_desc = desc;
6508 server.sort_alpha = alpha;
6509 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6510 if (sortby && (start != 0 || end != vectorlen-1))
6511 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6512 else
6513 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6514 }
6515
6516 /* Send command output to the output buffer, performing the specified
6517 * GET/DEL/INCR/DECR operations if any. */
6518 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6519 if (storekey == NULL) {
6520 /* STORE option not specified, sent the sorting result to client */
6521 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6522 for (j = start; j <= end; j++) {
6523 listNode *ln;
c7df85a4 6524 listIter li;
6525
    /* Without GET operations the element itself is the output. */
dd88747b 6526 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6527 listRewind(operations,&li);
6528 while((ln = listNext(&li))) {
443c6409 6529 redisSortOperation *sop = ln->value;
6530 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6531 vector[j].obj);
6532
6533 if (sop->type == REDIS_SORT_GET) {
6534 if (!val || val->type != REDIS_STRING) {
6535 addReply(c,shared.nullbulk);
6536 } else {
dd88747b 6537 addReplyBulk(c,val);
443c6409 6538 }
6539 } else {
dfc5e96c 6540 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6541 }
6542 }
ed9b544e 6543 }
443c6409 6544 } else {
6545 robj *listObject = createListObject();
6546 list *listPtr = (list*) listObject->ptr;
6547
6548 /* STORE option specified, set the sorting result as a List object */
6549 for (j = start; j <= end; j++) {
6550 listNode *ln;
c7df85a4 6551 listIter li;
6552
443c6409 6553 if (!getop) {
6554 listAddNodeTail(listPtr,vector[j].obj);
6555 incrRefCount(vector[j].obj);
6556 }
c7df85a4 6557 listRewind(operations,&li);
6558 while((ln = listNext(&li))) {
443c6409 6559 redisSortOperation *sop = ln->value;
6560 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6561 vector[j].obj);
6562
6563 if (sop->type == REDIS_SORT_GET) {
    /* A missing or non-string value is stored as an
     * empty string. */
6564 if (!val || val->type != REDIS_STRING) {
6565 listAddNodeTail(listPtr,createStringObject("",0));
6566 } else {
6567 listAddNodeTail(listPtr,val);
6568 incrRefCount(val);
6569 }
ed9b544e 6570 } else {
dfc5e96c 6571 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6572 }
ed9b544e 6573 }
ed9b544e 6574 }
    /* NOTE(review): presumably dictReplace() returns non-zero when the key
     * was newly added, so the dict then holds a new reference to storekey -
     * confirm against dict.c. */
121796f7 6575 if (dictReplace(c->db->dict,storekey,listObject)) {
6576 incrRefCount(storekey);
6577 }
443c6409 6578 /* Note: we add 1 because the DB is dirty anyway since even if the
6579 * SORT result is empty a new key is set and maybe the old content
6580 * replaced. */
6581 server.dirty += 1+outputlen;
6582 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6583 }
6584
6585 /* Cleanup */
6586 decrRefCount(sortval);
6587 listRelease(operations);
6588 for (j = 0; j < vectorlen; j++) {
    /* cmpobj holds a reference only in the BY + ALPHA case (it is set
     * with getDecodedObject() in the score loading loop above). */
6589 if (sortby && alpha && vector[j].u.cmpobj)
6590 decrRefCount(vector[j].u.cmpobj);
6591 }
6592 zfree(vector);
6593}
6594
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the output buffer: it must be large enough for the longest
 * possible result (an unsigned long long in decimal plus the unit and the
 * terminating nul, i.e. 22 bytes). 'n' is the amount of bytes.
 *
 * Every value of 'n' produces output: amounts from 1 TiB up use the T and P
 * units, and anything even larger is printed as a plain byte count, so the
 * buffer is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Larger than any unit we know about: fall back to raw bytes. */
        sprintf(s,"%lluB",n);
    }
}
6615
1c85b79f 6616/* Create the string returned by the INFO command. This is decoupled
6617 * by the INFO command itself as we need to report the same information
6618 * on memory corruption problems. */
6619static sds genRedisInfoString(void) {
ed9b544e 6620 sds info;
6621 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6622 int j;
ec6c7a1d 6623 char hmem[64];
55a8298f 6624
b72f6a4b 6625 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6626 info = sdscatprintf(sdsempty(),
6627 "redis_version:%s\r\n"
f1017b3f 6628 "arch_bits:%s\r\n"
7a932b74 6629 "multiplexing_api:%s\r\n"
0d7170a4 6630 "process_id:%ld\r\n"
682ac724 6631 "uptime_in_seconds:%ld\r\n"
6632 "uptime_in_days:%ld\r\n"
ed9b544e 6633 "connected_clients:%d\r\n"
6634 "connected_slaves:%d\r\n"
f86a74e9 6635 "blocked_clients:%d\r\n"
5fba9f71 6636 "used_memory:%zu\r\n"
ec6c7a1d 6637 "used_memory_human:%s\r\n"
ed9b544e 6638 "changes_since_last_save:%lld\r\n"
be2bb6b0 6639 "bgsave_in_progress:%d\r\n"
682ac724 6640 "last_save_time:%ld\r\n"
b3fad521 6641 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6642 "total_connections_received:%lld\r\n"
6643 "total_commands_processed:%lld\r\n"
2a6a2ed1 6644 "expired_keys:%lld\r\n"
55a8298f 6645 "hash_max_zipmap_entries:%ld\r\n"
6646 "hash_max_zipmap_value:%ld\r\n"
7d98e08c 6647 "vm_enabled:%d\r\n"
a0f643ea 6648 "role:%s\r\n"
ed9b544e 6649 ,REDIS_VERSION,
f1017b3f 6650 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6651 aeGetApiName(),
0d7170a4 6652 (long) getpid(),
a0f643ea 6653 uptime,
6654 uptime/(3600*24),
ed9b544e 6655 listLength(server.clients)-listLength(server.slaves),
6656 listLength(server.slaves),
d5d55fc3 6657 server.blpop_blocked_clients,
b72f6a4b 6658 zmalloc_used_memory(),
ec6c7a1d 6659 hmem,
ed9b544e 6660 server.dirty,
9d65a1bb 6661 server.bgsavechildpid != -1,
ed9b544e 6662 server.lastsave,
b3fad521 6663 server.bgrewritechildpid != -1,
ed9b544e 6664 server.stat_numconnections,
6665 server.stat_numcommands,
2a6a2ed1 6666 server.stat_expiredkeys,
55a8298f 6667 server.hash_max_zipmap_entries,
6668 server.hash_max_zipmap_value,
7d98e08c 6669 server.vm_enabled != 0,
a0f643ea 6670 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6671 );
a0f643ea 6672 if (server.masterhost) {
6673 info = sdscatprintf(info,
6674 "master_host:%s\r\n"
6675 "master_port:%d\r\n"
6676 "master_link_status:%s\r\n"
6677 "master_last_io_seconds_ago:%d\r\n"
6678 ,server.masterhost,
6679 server.masterport,
6680 (server.replstate == REDIS_REPL_CONNECTED) ?
6681 "up" : "down",
f72b934d 6682 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6683 );
6684 }
7d98e08c 6685 if (server.vm_enabled) {
1064ef87 6686 lockThreadedIO();
7d98e08c 6687 info = sdscatprintf(info,
6688 "vm_conf_max_memory:%llu\r\n"
6689 "vm_conf_page_size:%llu\r\n"
6690 "vm_conf_pages:%llu\r\n"
6691 "vm_stats_used_pages:%llu\r\n"
6692 "vm_stats_swapped_objects:%llu\r\n"
6693 "vm_stats_swappin_count:%llu\r\n"
6694 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6695 "vm_stats_io_newjobs_len:%lu\r\n"
6696 "vm_stats_io_processing_len:%lu\r\n"
6697 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6698 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6699 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6700 ,(unsigned long long) server.vm_max_memory,
6701 (unsigned long long) server.vm_page_size,
6702 (unsigned long long) server.vm_pages,
6703 (unsigned long long) server.vm_stats_used_pages,
6704 (unsigned long long) server.vm_stats_swapped_objects,
6705 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6706 (unsigned long long) server.vm_stats_swapouts,
6707 (unsigned long) listLength(server.io_newjobs),
6708 (unsigned long) listLength(server.io_processing),
6709 (unsigned long) listLength(server.io_processed),
d5d55fc3 6710 (unsigned long) server.io_active_threads,
6711 (unsigned long) server.vm_blocked_clients
7d98e08c 6712 );
1064ef87 6713 unlockThreadedIO();
7d98e08c 6714 }
c3cb078d 6715 for (j = 0; j < server.dbnum; j++) {
6716 long long keys, vkeys;
6717
6718 keys = dictSize(server.db[j].dict);
6719 vkeys = dictSize(server.db[j].expires);
6720 if (keys || vkeys) {
9d65a1bb 6721 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6722 j, keys, vkeys);
6723 }
6724 }
1c85b79f 6725 return info;
6726}
6727
6728static void infoCommand(redisClient *c) {
6729 sds info = genRedisInfoString();
83c6a618 6730 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6731 (unsigned long)sdslen(info)));
ed9b544e 6732 addReplySds(c,info);
70003d28 6733 addReply(c,shared.crlf);
ed9b544e 6734}
6735
3305306f 6736static void monitorCommand(redisClient *c) {
6737 /* ignore MONITOR if aleady slave or in monitor mode */
6738 if (c->flags & REDIS_SLAVE) return;
6739
6740 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6741 c->slaveseldb = 0;
6b47e12e 6742 listAddNodeTail(server.monitors,c);
3305306f 6743 addReply(c,shared.ok);
6744}
6745
6746/* ================================= Expire ================================= */
6747static int removeExpire(redisDb *db, robj *key) {
6748 if (dictDelete(db->expires,key) == DICT_OK) {
6749 return 1;
6750 } else {
6751 return 0;
6752 }
6753}
6754
6755static int setExpire(redisDb *db, robj *key, time_t when) {
6756 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6757 return 0;
6758 } else {
6759 incrRefCount(key);
6760 return 1;
6761 }
6762}
6763
bb32ede5 6764/* Return the expire time of the specified key, or -1 if no expire
6765 * is associated with this key (i.e. the key is non volatile) */
6766static time_t getExpire(redisDb *db, robj *key) {
6767 dictEntry *de;
6768
6769 /* No expire? return ASAP */
6770 if (dictSize(db->expires) == 0 ||
6771 (de = dictFind(db->expires,key)) == NULL) return -1;
6772
6773 return (time_t) dictGetEntryVal(de);
6774}
6775
3305306f 6776static int expireIfNeeded(redisDb *db, robj *key) {
6777 time_t when;
6778 dictEntry *de;
6779
6780 /* No expire? return ASAP */
6781 if (dictSize(db->expires) == 0 ||
6782 (de = dictFind(db->expires,key)) == NULL) return 0;
6783
6784 /* Lookup the expire */
6785 when = (time_t) dictGetEntryVal(de);
6786 if (time(NULL) <= when) return 0;
6787
6788 /* Delete the key */
6789 dictDelete(db->expires,key);
2a6a2ed1 6790 server.stat_expiredkeys++;
3305306f 6791 return dictDelete(db->dict,key) == DICT_OK;
6792}
6793
6794static int deleteIfVolatile(redisDb *db, robj *key) {
6795 dictEntry *de;
6796
6797 /* No expire? return ASAP */
6798 if (dictSize(db->expires) == 0 ||
6799 (de = dictFind(db->expires,key)) == NULL) return 0;
6800
6801 /* Delete the key */
0c66a471 6802 server.dirty++;
2a6a2ed1 6803 server.stat_expiredkeys++;
3305306f 6804 dictDelete(db->expires,key);
6805 return dictDelete(db->dict,key) == DICT_OK;
6806}
6807
802e8373 6808static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 6809 dictEntry *de;
3305306f 6810
802e8373 6811 de = dictFind(c->db->dict,key);
3305306f 6812 if (de == NULL) {
6813 addReply(c,shared.czero);
6814 return;
6815 }
43e5ccdf 6816 if (seconds < 0) {
6817 if (deleteKey(c->db,key)) server.dirty++;
6818 addReply(c, shared.cone);
3305306f 6819 return;
6820 } else {
6821 time_t when = time(NULL)+seconds;
802e8373 6822 if (setExpire(c->db,key,when)) {
3305306f 6823 addReply(c,shared.cone);
77423026 6824 server.dirty++;
6825 } else {
3305306f 6826 addReply(c,shared.czero);
77423026 6827 }
3305306f 6828 return;
6829 }
6830}
6831
802e8373 6832static void expireCommand(redisClient *c) {
6833 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6834}
6835
6836static void expireatCommand(redisClient *c) {
6837 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6838}
6839
fd88489a 6840static void ttlCommand(redisClient *c) {
6841 time_t expire;
6842 int ttl = -1;
6843
6844 expire = getExpire(c->db,c->argv[1]);
6845 if (expire != -1) {
6846 ttl = (int) (expire-time(NULL));
6847 if (ttl < 0) ttl = -1;
6848 }
6849 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6850}
6851
6e469882 6852/* ================================ MULTI/EXEC ============================== */
6853
6854/* Client state initialization for MULTI/EXEC */
6855static void initClientMultiState(redisClient *c) {
6856 c->mstate.commands = NULL;
6857 c->mstate.count = 0;
6858}
6859
6860/* Release all the resources associated with MULTI/EXEC state */
6861static void freeClientMultiState(redisClient *c) {
6862 int j;
6863
6864 for (j = 0; j < c->mstate.count; j++) {
6865 int i;
6866 multiCmd *mc = c->mstate.commands+j;
6867
6868 for (i = 0; i < mc->argc; i++)
6869 decrRefCount(mc->argv[i]);
6870 zfree(mc->argv);
6871 }
6872 zfree(c->mstate.commands);
6873}
6874
6875/* Add a new command into the MULTI commands queue */
6876static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6877 multiCmd *mc;
6878 int j;
6879
6880 c->mstate.commands = zrealloc(c->mstate.commands,
6881 sizeof(multiCmd)*(c->mstate.count+1));
6882 mc = c->mstate.commands+c->mstate.count;
6883 mc->cmd = cmd;
6884 mc->argc = c->argc;
6885 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6886 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6887 for (j = 0; j < c->argc; j++)
6888 incrRefCount(mc->argv[j]);
6889 c->mstate.count++;
6890}
6891
6892static void multiCommand(redisClient *c) {
6893 c->flags |= REDIS_MULTI;
36c548f0 6894 addReply(c,shared.ok);
6e469882 6895}
6896
18b6cb76
DJ
6897static void discardCommand(redisClient *c) {
6898 if (!(c->flags & REDIS_MULTI)) {
6899 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6900 return;
6901 }
6902
6903 freeClientMultiState(c);
6904 initClientMultiState(c);
6905 c->flags &= (~REDIS_MULTI);
6906 addReply(c,shared.ok);
6907}
6908
6e469882 6909static void execCommand(redisClient *c) {
6910 int j;
6911 robj **orig_argv;
6912 int orig_argc;
6913
6914 if (!(c->flags & REDIS_MULTI)) {
6915 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6916 return;
6917 }
6918
6919 orig_argv = c->argv;
6920 orig_argc = c->argc;
6921 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6922 for (j = 0; j < c->mstate.count; j++) {
6923 c->argc = c->mstate.commands[j].argc;
6924 c->argv = c->mstate.commands[j].argv;
6925 call(c,c->mstate.commands[j].cmd);
6926 }
6927 c->argv = orig_argv;
6928 c->argc = orig_argc;
6929 freeClientMultiState(c);
6930 initClientMultiState(c);
6931 c->flags &= (~REDIS_MULTI);
6932}
6933
4409877e 6934/* =========================== Blocking Operations ========================= */
6935
6936/* Currently Redis blocking operations support is limited to list POP ops,
6937 * so the current implementation is not fully generic, but it is also not
6938 * completely specific so it will not require a rewrite to support new
6939 * kind of blocking operations in the future.
6940 *
6941 * Still it's important to note that list blocking operations can be already
6942 * used as a notification mechanism in order to implement other blocking
6943 * operations at application level, so there must be a very strong evidence
6944 * of usefulness and generality before new blocking operations are implemented.
6945 *
6946 * This is how the current blocking POP works, we use BLPOP as example:
6947 * - If the user calls BLPOP and the key exists and contains a non empty list
6948 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6949 * if there is no need to block.
6950 * - If instead BLPOP is called and the key does not exist or the list is
6951 * empty we need to block. In order to do so we remove the notification for
6952 * new data to read in the client socket (so that we'll not serve new
6953 * requests if the blocking request is not served). Also we put the client
95242ab5 6954 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
4409877e 6955 * blocking for these keys.
6956 * - If a PUSH operation against a key with blocked clients waiting is
6957 * performed, we serve the first in the list: basically instead of pushing
6958 * the new element inside the list we return it to the (first / oldest)
6959 * blocking client, unblock the client, and remove it from the list.
6960 *
6961 * The above comment and the source code should be enough in order to understand
6962 * the implementation and modify / fix it later.
6963 */
6964
6965/* Set a client in blocking mode for the specified key, with the specified
6966 * timeout */
b177fd30 6967static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 6968 dictEntry *de;
6969 list *l;
b177fd30 6970 int j;
4409877e 6971
b177fd30 6972 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6973 c->blockingkeysnum = numkeys;
4409877e 6974 c->blockingto = timeout;
b177fd30 6975 for (j = 0; j < numkeys; j++) {
6976 /* Add the key in the client structure, to map clients -> keys */
6977 c->blockingkeys[j] = keys[j];
6978 incrRefCount(keys[j]);
4409877e 6979
b177fd30 6980 /* And in the other "side", to map keys -> clients */
6981 de = dictFind(c->db->blockingkeys,keys[j]);
6982 if (de == NULL) {
6983 int retval;
6984
6985 /* For every key we take a list of clients blocked for it */
6986 l = listCreate();
6987 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6988 incrRefCount(keys[j]);
6989 assert(retval == DICT_OK);
6990 } else {
6991 l = dictGetEntryVal(de);
6992 }
6993 listAddNodeTail(l,c);
4409877e 6994 }
b177fd30 6995 /* Mark the client as a blocked client */
4409877e 6996 c->flags |= REDIS_BLOCKED;
d5d55fc3 6997 server.blpop_blocked_clients++;
4409877e 6998}
6999
7000/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7001static void unblockClientWaitingData(redisClient *c) {
4409877e 7002 dictEntry *de;
7003 list *l;
b177fd30 7004 int j;
4409877e 7005
b177fd30 7006 assert(c->blockingkeys != NULL);
7007 /* The client may wait for multiple keys, so unblock it for every key. */
7008 for (j = 0; j < c->blockingkeysnum; j++) {
7009 /* Remove this client from the list of clients waiting for this key. */
7010 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7011 assert(de != NULL);
7012 l = dictGetEntryVal(de);
7013 listDelNode(l,listSearchKey(l,c));
7014 /* If the list is empty we need to remove it to avoid wasting memory */
7015 if (listLength(l) == 0)
7016 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7017 decrRefCount(c->blockingkeys[j]);
7018 }
7019 /* Cleanup the client structure */
7020 zfree(c->blockingkeys);
7021 c->blockingkeys = NULL;
4409877e 7022 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7023 server.blpop_blocked_clients--;
5921aa36 7024 /* We want to process data if there is some command waiting
b0d8747d 7025 * in the input buffer. Note that this is safe even if
7026 * unblockClientWaitingData() gets called from freeClient() because
7027 * freeClient() will be smart enough to call this function
7028 * *after* c->querybuf was set to NULL. */
4409877e 7029 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7030}
7031
7032/* This should be called from any function PUSHing into lists.
7033 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7034 * 'ele' is the element pushed.
7035 *
7036 * If the function returns 0 there was no client waiting for a list push
7037 * against this key.
7038 *
7039 * If the function returns 1 there was a client waiting for a list push
7040 * against this key, the element was passed to this client thus it's not
7041 * needed to actually add it to the list and the caller should return asap. */
7042static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7043 struct dictEntry *de;
7044 redisClient *receiver;
7045 list *l;
7046 listNode *ln;
7047
7048 de = dictFind(c->db->blockingkeys,key);
7049 if (de == NULL) return 0;
7050 l = dictGetEntryVal(de);
7051 ln = listFirst(l);
7052 assert(ln != NULL);
7053 receiver = ln->value;
4409877e 7054
b177fd30 7055 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7056 addReplyBulk(receiver,key);
7057 addReplyBulk(receiver,ele);
b0d8747d 7058 unblockClientWaitingData(receiver);
4409877e 7059 return 1;
7060}
7061
7062/* Blocking RPOP/LPOP */
7063static void blockingPopGenericCommand(redisClient *c, int where) {
7064 robj *o;
7065 time_t timeout;
b177fd30 7066 int j;
4409877e 7067
b177fd30 7068 for (j = 1; j < c->argc-1; j++) {
7069 o = lookupKeyWrite(c->db,c->argv[j]);
7070 if (o != NULL) {
7071 if (o->type != REDIS_LIST) {
7072 addReply(c,shared.wrongtypeerr);
4409877e 7073 return;
b177fd30 7074 } else {
7075 list *list = o->ptr;
7076 if (listLength(list) != 0) {
7077 /* If the list contains elements fall back to the usual
7078 * non-blocking POP operation */
7079 robj *argv[2], **orig_argv;
7080 int orig_argc;
7081
7082 /* We need to alter the command arguments before to call
7083 * popGenericCommand() as the command takes a single key. */
7084 orig_argv = c->argv;
7085 orig_argc = c->argc;
7086 argv[1] = c->argv[j];
7087 c->argv = argv;
7088 c->argc = 2;
7089
7090 /* Also the return value is different, we need to output
7091 * the multi bulk reply header and the key name. The
7092 * "real" command will add the last element (the value)
7093 * for us. If this souds like an hack to you it's just
7094 * because it is... */
7095 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7096 addReplyBulk(c,argv[1]);
b177fd30 7097 popGenericCommand(c,where);
7098
7099 /* Fix the client structure with the original stuff */
7100 c->argv = orig_argv;
7101 c->argc = orig_argc;
7102 return;
7103 }
4409877e 7104 }
7105 }
7106 }
7107 /* If the list is empty or the key does not exists we must block */
b177fd30 7108 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7109 if (timeout > 0) timeout += time(NULL);
b177fd30 7110 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7111}
7112
7113static void blpopCommand(redisClient *c) {
7114 blockingPopGenericCommand(c,REDIS_HEAD);
7115}
7116
7117static void brpopCommand(redisClient *c) {
7118 blockingPopGenericCommand(c,REDIS_TAIL);
7119}
7120
ed9b544e 7121/* =============================== Replication ============================= */
7122
a4d1ba9a 7123static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7124 ssize_t nwritten, ret = size;
7125 time_t start = time(NULL);
7126
7127 timeout++;
7128 while(size) {
7129 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7130 nwritten = write(fd,ptr,size);
7131 if (nwritten == -1) return -1;
7132 ptr += nwritten;
7133 size -= nwritten;
7134 }
7135 if ((time(NULL)-start) > timeout) {
7136 errno = ETIMEDOUT;
7137 return -1;
7138 }
7139 }
7140 return ret;
7141}
7142
a4d1ba9a 7143static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7144 ssize_t nread, totread = 0;
7145 time_t start = time(NULL);
7146
7147 timeout++;
7148 while(size) {
7149 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7150 nread = read(fd,ptr,size);
7151 if (nread == -1) return -1;
7152 ptr += nread;
7153 size -= nread;
7154 totread += nread;
7155 }
7156 if ((time(NULL)-start) > timeout) {
7157 errno = ETIMEDOUT;
7158 return -1;
7159 }
7160 }
7161 return totread;
7162}
7163
/* Synchronously read a line from 'fd' into 'ptr', one byte at a time via
 * syncRead(), stopping at the first '\n'.
 *
 * 'size' is the capacity of the destination buffer, including the room for
 * the terminating NUL byte: at most size-1 characters are stored and the
 * buffer is always NUL terminated after each stored byte. A '\r' immediately
 * preceding the '\n' is replaced with a NUL byte (but still counted in the
 * return value, as in the original implementation).
 *
 * Returns the number of characters stored before the newline (or before the
 * buffer filled up), or -1 if syncRead() failed. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* reserve one byte for the NUL terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored: without this the loop never
         * terminates on long lines and writes past the end of the buffer. */
        size--;
    }
    return nread;
}
7184
7185static void syncCommand(redisClient *c) {
40d224a9 7186 /* ignore SYNC if aleady slave or in monitor mode */
7187 if (c->flags & REDIS_SLAVE) return;
7188
7189 /* SYNC can't be issued when the server has pending data to send to
7190 * the client about already issued commands. We need a fresh reply
7191 * buffer registering the differences between the BGSAVE and the current
7192 * dataset, so that we can copy to other slaves if needed. */
7193 if (listLength(c->reply) != 0) {
7194 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7195 return;
7196 }
7197
7198 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7199 /* Here we need to check if there is a background saving operation
7200 * in progress, or if it is required to start one */
9d65a1bb 7201 if (server.bgsavechildpid != -1) {
40d224a9 7202 /* Ok a background save is in progress. Let's check if it is a good
7203 * one for replication, i.e. if there is another slave that is
7204 * registering differences since the server forked to save */
7205 redisClient *slave;
7206 listNode *ln;
c7df85a4 7207 listIter li;
40d224a9 7208
c7df85a4 7209 listRewind(server.slaves,&li);
7210 while((ln = listNext(&li))) {
40d224a9 7211 slave = ln->value;
7212 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7213 }
7214 if (ln) {
7215 /* Perfect, the server is already registering differences for
7216 * another slave. Set the right state, and copy the buffer. */
7217 listRelease(c->reply);
7218 c->reply = listDup(slave->reply);
40d224a9 7219 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7220 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7221 } else {
7222 /* No way, we need to wait for the next BGSAVE in order to
7223 * register differences */
7224 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7225 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7226 }
7227 } else {
7228 /* Ok we don't have a BGSAVE in progress, let's start one */
7229 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7230 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7231 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7232 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7233 return;
7234 }
7235 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7236 }
6208b3a7 7237 c->repldbfd = -1;
40d224a9 7238 c->flags |= REDIS_SLAVE;
7239 c->slaveseldb = 0;
6b47e12e 7240 listAddNodeTail(server.slaves,c);
40d224a9 7241 return;
7242}
7243
6208b3a7 7244static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7245 redisClient *slave = privdata;
7246 REDIS_NOTUSED(el);
7247 REDIS_NOTUSED(mask);
7248 char buf[REDIS_IOBUF_LEN];
7249 ssize_t nwritten, buflen;
7250
7251 if (slave->repldboff == 0) {
7252 /* Write the bulk write count before to transfer the DB. In theory here
7253 * we don't know how much room there is in the output buffer of the
7254 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7255 * operations) will never be smaller than the few bytes we need. */
7256 sds bulkcount;
7257
7258 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7259 slave->repldbsize);
7260 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7261 {
7262 sdsfree(bulkcount);
7263 freeClient(slave);
7264 return;
7265 }
7266 sdsfree(bulkcount);
7267 }
7268 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7269 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7270 if (buflen <= 0) {
7271 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7272 (buflen == 0) ? "premature EOF" : strerror(errno));
7273 freeClient(slave);
7274 return;
7275 }
7276 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7277 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7278 strerror(errno));
7279 freeClient(slave);
7280 return;
7281 }
7282 slave->repldboff += nwritten;
7283 if (slave->repldboff == slave->repldbsize) {
7284 close(slave->repldbfd);
7285 slave->repldbfd = -1;
7286 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7287 slave->replstate = REDIS_REPL_ONLINE;
7288 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7289 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7290 freeClient(slave);
7291 return;
7292 }
7293 addReplySds(slave,sdsempty());
7294 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7295 }
7296}
ed9b544e 7297
a3b21203 7298/* This function is called at the end of every backgrond saving.
7299 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7300 * otherwise REDIS_ERR is passed to the function.
7301 *
7302 * The goal of this function is to handle slaves waiting for a successful
7303 * background saving in order to perform non-blocking synchronization. */
7304static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7305 listNode *ln;
7306 int startbgsave = 0;
c7df85a4 7307 listIter li;
ed9b544e 7308
c7df85a4 7309 listRewind(server.slaves,&li);
7310 while((ln = listNext(&li))) {
6208b3a7 7311 redisClient *slave = ln->value;
ed9b544e 7312
6208b3a7 7313 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7314 startbgsave = 1;
7315 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7316 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7317 struct redis_stat buf;
6208b3a7 7318
7319 if (bgsaveerr != REDIS_OK) {
7320 freeClient(slave);
7321 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7322 continue;
7323 }
7324 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7325 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7326 freeClient(slave);
7327 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7328 continue;
7329 }
7330 slave->repldboff = 0;
7331 slave->repldbsize = buf.st_size;
7332 slave->replstate = REDIS_REPL_SEND_BULK;
7333 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7334 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7335 freeClient(slave);
7336 continue;
7337 }
7338 }
ed9b544e 7339 }
6208b3a7 7340 if (startbgsave) {
7341 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7342 listIter li;
7343
7344 listRewind(server.slaves,&li);
6208b3a7 7345 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7346 while((ln = listNext(&li))) {
6208b3a7 7347 redisClient *slave = ln->value;
ed9b544e 7348
6208b3a7 7349 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7350 freeClient(slave);
7351 }
7352 }
7353 }
ed9b544e 7354}
7355
7356static int syncWithMaster(void) {
d0ccebcf 7357 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7358 long dumpsize;
ed9b544e 7359 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7360 int dfd, maxtries = 5;
ed9b544e 7361
7362 if (fd == -1) {
7363 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7364 strerror(errno));
7365 return REDIS_ERR;
7366 }
d0ccebcf 7367
7368 /* AUTH with the master if required. */
7369 if(server.masterauth) {
7370 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7371 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7372 close(fd);
7373 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7374 strerror(errno));
7375 return REDIS_ERR;
7376 }
7377 /* Read the AUTH result. */
7378 if (syncReadLine(fd,buf,1024,3600) == -1) {
7379 close(fd);
7380 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7381 strerror(errno));
7382 return REDIS_ERR;
7383 }
7384 if (buf[0] != '+') {
7385 close(fd);
7386 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7387 return REDIS_ERR;
7388 }
7389 }
7390
ed9b544e 7391 /* Issue the SYNC command */
7392 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7393 close(fd);
7394 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7395 strerror(errno));
7396 return REDIS_ERR;
7397 }
7398 /* Read the bulk write count */
8c4d91fc 7399 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7400 close(fd);
7401 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7402 strerror(errno));
7403 return REDIS_ERR;
7404 }
4aa701c1 7405 if (buf[0] != '$') {
7406 close(fd);
7407 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7408 return REDIS_ERR;
7409 }
18e61fa2 7410 dumpsize = strtol(buf+1,NULL,10);
7411 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7412 /* Read the bulk write data on a temp file */
8c5abee8 7413 while(maxtries--) {
7414 snprintf(tmpfile,256,
7415 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7416 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7417 if (dfd != -1) break;
5de9ad7c 7418 sleep(1);
8c5abee8 7419 }
ed9b544e 7420 if (dfd == -1) {
7421 close(fd);
7422 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7423 return REDIS_ERR;
7424 }
7425 while(dumpsize) {
7426 int nread, nwritten;
7427
7428 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7429 if (nread == -1) {
7430 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7431 strerror(errno));
7432 close(fd);
7433 close(dfd);
7434 return REDIS_ERR;
7435 }
7436 nwritten = write(dfd,buf,nread);
7437 if (nwritten == -1) {
7438 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7439 close(fd);
7440 close(dfd);
7441 return REDIS_ERR;
7442 }
7443 dumpsize -= nread;
7444 }
7445 close(dfd);
7446 if (rename(tmpfile,server.dbfilename) == -1) {
7447 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7448 unlink(tmpfile);
7449 close(fd);
7450 return REDIS_ERR;
7451 }
7452 emptyDb();
f78fd11b 7453 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7454 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7455 close(fd);
7456 return REDIS_ERR;
7457 }
7458 server.master = createClient(fd);
7459 server.master->flags |= REDIS_MASTER;
179b3952 7460 server.master->authenticated = 1;
ed9b544e 7461 server.replstate = REDIS_REPL_CONNECTED;
7462 return REDIS_OK;
7463}
7464
321b0e13 7465static void slaveofCommand(redisClient *c) {
7466 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7467 !strcasecmp(c->argv[2]->ptr,"one")) {
7468 if (server.masterhost) {
7469 sdsfree(server.masterhost);
7470 server.masterhost = NULL;
7471 if (server.master) freeClient(server.master);
7472 server.replstate = REDIS_REPL_NONE;
7473 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7474 }
7475 } else {
7476 sdsfree(server.masterhost);
7477 server.masterhost = sdsdup(c->argv[1]->ptr);
7478 server.masterport = atoi(c->argv[2]->ptr);
7479 if (server.master) freeClient(server.master);
7480 server.replstate = REDIS_REPL_CONNECT;
7481 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7482 server.masterhost, server.masterport);
7483 }
7484 addReply(c,shared.ok);
7485}
7486
3fd78bcd 7487/* ============================ Maxmemory directive ======================== */
7488
a5819310 7489/* Try to free one object form the pre-allocated objects free list.
7490 * This is useful under low mem conditions as by default we take 1 million
7491 * free objects allocated. On success REDIS_OK is returned, otherwise
7492 * REDIS_ERR. */
7493static int tryFreeOneObjectFromFreelist(void) {
f870935d 7494 robj *o;
7495
a5819310 7496 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7497 if (listLength(server.objfreelist)) {
7498 listNode *head = listFirst(server.objfreelist);
7499 o = listNodeValue(head);
7500 listDelNode(server.objfreelist,head);
7501 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7502 zfree(o);
7503 return REDIS_OK;
7504 } else {
7505 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7506 return REDIS_ERR;
7507 }
f870935d 7508}
7509
3fd78bcd 7510/* This function gets called when 'maxmemory' is set on the config file to limit
7511 * the max memory used by the server, and we are out of memory.
7512 * This function will try to, in order:
7513 *
7514 * - Free objects from the free list
7515 * - Try to remove keys with an EXPIRE set
7516 *
7517 * It is not possible to free enough memory to reach used-memory < maxmemory
7518 * the server will start refusing commands that will enlarge even more the
7519 * memory usage.
7520 */
7521static void freeMemoryIfNeeded(void) {
7522 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7523 int j, k, freed = 0;
7524
7525 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7526 for (j = 0; j < server.dbnum; j++) {
7527 int minttl = -1;
7528 robj *minkey = NULL;
7529 struct dictEntry *de;
7530
7531 if (dictSize(server.db[j].expires)) {
7532 freed = 1;
7533 /* From a sample of three keys drop the one nearest to
7534 * the natural expire */
7535 for (k = 0; k < 3; k++) {
7536 time_t t;
7537
7538 de = dictGetRandomKey(server.db[j].expires);
7539 t = (time_t) dictGetEntryVal(de);
7540 if (minttl == -1 || t < minttl) {
7541 minkey = dictGetEntryKey(de);
7542 minttl = t;
3fd78bcd 7543 }
3fd78bcd 7544 }
a5819310 7545 deleteKey(server.db+j,minkey);
3fd78bcd 7546 }
3fd78bcd 7547 }
a5819310 7548 if (!freed) return; /* nothing to free... */
3fd78bcd 7549 }
7550}
7551
f80dff62 7552/* ============================== Append Only file ========================== */
7553
7554static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7555 sds buf = sdsempty();
7556 int j;
7557 ssize_t nwritten;
7558 time_t now;
7559 robj *tmpargv[3];
7560
7561 /* The DB this command was targetting is not the same as the last command
7562 * we appendend. To issue a SELECT command is needed. */
7563 if (dictid != server.appendseldb) {
7564 char seldb[64];
7565
7566 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7567 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7568 (unsigned long)strlen(seldb),seldb);
f80dff62 7569 server.appendseldb = dictid;
7570 }
7571
7572 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7573 * EXPIREs into EXPIREATs calls */
7574 if (cmd->proc == expireCommand) {
7575 long when;
7576
7577 tmpargv[0] = createStringObject("EXPIREAT",8);
7578 tmpargv[1] = argv[1];
7579 incrRefCount(argv[1]);
7580 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7581 tmpargv[2] = createObject(REDIS_STRING,
7582 sdscatprintf(sdsempty(),"%ld",when));
7583 argv = tmpargv;
7584 }
7585
7586 /* Append the actual command */
7587 buf = sdscatprintf(buf,"*%d\r\n",argc);
7588 for (j = 0; j < argc; j++) {
7589 robj *o = argv[j];
7590
9d65a1bb 7591 o = getDecodedObject(o);
83c6a618 7592 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7593 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7594 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7595 decrRefCount(o);
f80dff62 7596 }
7597
7598 /* Free the objects from the modified argv for EXPIREAT */
7599 if (cmd->proc == expireCommand) {
7600 for (j = 0; j < 3; j++)
7601 decrRefCount(argv[j]);
7602 }
7603
7604 /* We want to perform a single write. This should be guaranteed atomic
7605 * at least if the filesystem we are writing is a real physical one.
7606 * While this will save us against the server being killed I don't think
7607 * there is much to do about the whole server stopping for power problems
7608 * or alike */
7609 nwritten = write(server.appendfd,buf,sdslen(buf));
7610 if (nwritten != (signed)sdslen(buf)) {
7611 /* Ooops, we are in troubles. The best thing to do for now is
7612 * to simply exit instead to give the illusion that everything is
7613 * working as expected. */
7614 if (nwritten == -1) {
7615 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7616 } else {
7617 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7618 }
7619 exit(1);
7620 }
85a83172 7621 /* If a background append only file rewriting is in progress we want to
7622 * accumulate the differences between the child DB and the current one
7623 * in a buffer, so that when the child process will do its work we
7624 * can append the differences to the new append only file. */
7625 if (server.bgrewritechildpid != -1)
7626 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7627
7628 sdsfree(buf);
f80dff62 7629 now = time(NULL);
7630 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7631 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7632 now-server.lastfsync > 1))
7633 {
7634 fsync(server.appendfd); /* Let's try to get this data on the disk */
7635 server.lastfsync = now;
7636 }
7637}
7638
7639/* In Redis commands are always executed in the context of a client, so in
7640 * order to load the append only file we need to create a fake client. */
7641static struct redisClient *createFakeClient(void) {
7642 struct redisClient *c = zmalloc(sizeof(*c));
7643
7644 selectDb(c,0);
7645 c->fd = -1;
7646 c->querybuf = sdsempty();
7647 c->argc = 0;
7648 c->argv = NULL;
7649 c->flags = 0;
9387d17d 7650 /* We set the fake client as a slave waiting for the synchronization
7651 * so that Redis will not try to send replies to this client. */
7652 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7653 c->reply = listCreate();
7654 listSetFreeMethod(c->reply,decrRefCount);
7655 listSetDupMethod(c->reply,dupClientReplyValue);
7656 return c;
7657}
7658
7659static void freeFakeClient(struct redisClient *c) {
7660 sdsfree(c->querybuf);
7661 listRelease(c->reply);
7662 zfree(c);
7663}
7664
7665/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7666 * error (the append only file is zero-length) REDIS_ERR is returned. On
7667 * fatal error an error message is logged and the program exists. */
7668int loadAppendOnlyFile(char *filename) {
7669 struct redisClient *fakeClient;
7670 FILE *fp = fopen(filename,"r");
7671 struct redis_stat sb;
b492cf00 7672 unsigned long long loadedkeys = 0;
f80dff62 7673
7674 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7675 return REDIS_ERR;
7676
7677 if (fp == NULL) {
7678 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7679 exit(1);
7680 }
7681
7682 fakeClient = createFakeClient();
7683 while(1) {
7684 int argc, j;
7685 unsigned long len;
7686 robj **argv;
7687 char buf[128];
7688 sds argsds;
7689 struct redisCommand *cmd;
7690
7691 if (fgets(buf,sizeof(buf),fp) == NULL) {
7692 if (feof(fp))
7693 break;
7694 else
7695 goto readerr;
7696 }
7697 if (buf[0] != '*') goto fmterr;
7698 argc = atoi(buf+1);
7699 argv = zmalloc(sizeof(robj*)*argc);
7700 for (j = 0; j < argc; j++) {
7701 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7702 if (buf[0] != '$') goto fmterr;
7703 len = strtol(buf+1,NULL,10);
7704 argsds = sdsnewlen(NULL,len);
0f151ef1 7705 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7706 argv[j] = createObject(REDIS_STRING,argsds);
7707 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7708 }
7709
7710 /* Command lookup */
7711 cmd = lookupCommand(argv[0]->ptr);
7712 if (!cmd) {
7713 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7714 exit(1);
7715 }
7716 /* Try object sharing and encoding */
7717 if (server.shareobjects) {
7718 int j;
7719 for(j = 1; j < argc; j++)
7720 argv[j] = tryObjectSharing(argv[j]);
7721 }
7722 if (cmd->flags & REDIS_CMD_BULK)
7723 tryObjectEncoding(argv[argc-1]);
7724 /* Run the command in the context of a fake client */
7725 fakeClient->argc = argc;
7726 fakeClient->argv = argv;
7727 cmd->proc(fakeClient);
7728 /* Discard the reply objects list from the fake client */
7729 while(listLength(fakeClient->reply))
7730 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7731 /* Clean up, ready for the next command */
7732 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7733 zfree(argv);
b492cf00 7734 /* Handle swapping while loading big datasets when VM is on */
7735 loadedkeys++;
7736 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7737 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7738 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7739 }
7740 }
f80dff62 7741 }
7742 fclose(fp);
7743 freeFakeClient(fakeClient);
7744 return REDIS_OK;
7745
7746readerr:
7747 if (feof(fp)) {
7748 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7749 } else {
7750 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7751 }
7752 exit(1);
7753fmterr:
7754 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7755 exit(1);
7756}
7757
9d65a1bb 7758/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7759static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7760 char buf[128];
b9bc0eef 7761 int decrrc = 0;
7762
f2d9f50f 7763 /* Avoid the incr/decr ref count business if possible to help
7764 * copy-on-write (we are often in a child process when this function
7765 * is called).
7766 * Also makes sure that key objects don't get incrRefCount-ed when VM
7767 * is enabled */
7768 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 7769 obj = getDecodedObject(obj);
7770 decrrc = 1;
7771 }
9d65a1bb 7772 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7773 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 7774 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7775 goto err;
9d65a1bb 7776 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 7777 if (decrrc) decrRefCount(obj);
9d65a1bb 7778 return 1;
7779err:
b9bc0eef 7780 if (decrrc) decrRefCount(obj);
9d65a1bb 7781 return 0;
7782}
7783
/* Write a binary-safe string to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that may contain any value, NUL included. An
 * empty payload (len == 0) is valid and produces "$0\r\n\r\n".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long: %lu is the matching conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7795
/* Write a double to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    digits = (unsigned long)strlen(payload) - 2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7806
/* Write a long to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the value in decimal notation.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    digits = (unsigned long)strlen(payload) - 2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7817
7818/* Write a sequence of commands able to fully rebuild the dataset into
7819 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7820static int rewriteAppendOnlyFile(char *filename) {
7821 dictIterator *di = NULL;
7822 dictEntry *de;
7823 FILE *fp;
7824 char tmpfile[256];
7825 int j;
7826 time_t now = time(NULL);
7827
7828 /* Note that we have to use a different temp name here compared to the
7829 * one used by rewriteAppendOnlyFileBackground() function. */
7830 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7831 fp = fopen(tmpfile,"w");
7832 if (!fp) {
7833 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7834 return REDIS_ERR;
7835 }
7836 for (j = 0; j < server.dbnum; j++) {
7837 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7838 redisDb *db = server.db+j;
7839 dict *d = db->dict;
7840 if (dictSize(d) == 0) continue;
7841 di = dictGetIterator(d);
7842 if (!di) {
7843 fclose(fp);
7844 return REDIS_ERR;
7845 }
7846
7847 /* SELECT the new DB */
7848 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 7849 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 7850
7851 /* Iterate this DB writing every entry */
7852 while((de = dictNext(di)) != NULL) {
e7546c63 7853 robj *key, *o;
7854 time_t expiretime;
7855 int swapped;
7856
7857 key = dictGetEntryKey(de);
b9bc0eef 7858 /* If the value for this key is swapped, load a preview in memory.
7859 * We use a "swapped" flag to remember if we need to free the
7860 * value object instead to just increment the ref count anyway
7861 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 7862 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7863 key->storage == REDIS_VM_SWAPPING) {
e7546c63 7864 o = dictGetEntryVal(de);
7865 swapped = 0;
7866 } else {
7867 o = vmPreviewObject(key);
e7546c63 7868 swapped = 1;
7869 }
7870 expiretime = getExpire(db,key);
9d65a1bb 7871
7872 /* Save the key and associated value */
9d65a1bb 7873 if (o->type == REDIS_STRING) {
7874 /* Emit a SET command */
7875 char cmd[]="*3\r\n$3\r\nSET\r\n";
7876 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7877 /* Key and value */
9c8e3cee 7878 if (fwriteBulkObject(fp,key) == 0) goto werr;
7879 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 7880 } else if (o->type == REDIS_LIST) {
7881 /* Emit the RPUSHes needed to rebuild the list */
7882 list *list = o->ptr;
7883 listNode *ln;
c7df85a4 7884 listIter li;
9d65a1bb 7885
c7df85a4 7886 listRewind(list,&li);
7887 while((ln = listNext(&li))) {
9d65a1bb 7888 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7889 robj *eleobj = listNodeValue(ln);
7890
7891 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7892 if (fwriteBulkObject(fp,key) == 0) goto werr;
7893 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7894 }
7895 } else if (o->type == REDIS_SET) {
7896 /* Emit the SADDs needed to rebuild the set */
7897 dict *set = o->ptr;
7898 dictIterator *di = dictGetIterator(set);
7899 dictEntry *de;
7900
7901 while((de = dictNext(di)) != NULL) {
7902 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7903 robj *eleobj = dictGetEntryKey(de);
7904
7905 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7906 if (fwriteBulkObject(fp,key) == 0) goto werr;
7907 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7908 }
7909 dictReleaseIterator(di);
7910 } else if (o->type == REDIS_ZSET) {
7911 /* Emit the ZADDs needed to rebuild the sorted set */
7912 zset *zs = o->ptr;
7913 dictIterator *di = dictGetIterator(zs->dict);
7914 dictEntry *de;
7915
7916 while((de = dictNext(di)) != NULL) {
7917 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7918 robj *eleobj = dictGetEntryKey(de);
7919 double *score = dictGetEntryVal(de);
7920
7921 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7922 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7923 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 7924 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7925 }
7926 dictReleaseIterator(di);
9c8e3cee 7927 } else if (o->type == REDIS_HASH) {
7928 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7929
7930 /* Emit the HSETs needed to rebuild the hash */
7931 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7932 unsigned char *p = zipmapRewind(o->ptr);
7933 unsigned char *field, *val;
7934 unsigned int flen, vlen;
7935
7936 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7937 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7938 if (fwriteBulkObject(fp,key) == 0) goto werr;
7939 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7940 return -1;
7941 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7942 return -1;
7943 }
7944 } else {
7945 dictIterator *di = dictGetIterator(o->ptr);
7946 dictEntry *de;
7947
7948 while((de = dictNext(di)) != NULL) {
7949 robj *field = dictGetEntryKey(de);
7950 robj *val = dictGetEntryVal(de);
7951
7952 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7953 if (fwriteBulkObject(fp,key) == 0) goto werr;
7954 if (fwriteBulkObject(fp,field) == -1) return -1;
7955 if (fwriteBulkObject(fp,val) == -1) return -1;
7956 }
7957 dictReleaseIterator(di);
7958 }
9d65a1bb 7959 } else {
78409a0f 7960 redisAssert(0);
9d65a1bb 7961 }
7962 /* Save the expire time */
7963 if (expiretime != -1) {
e96e4fbf 7964 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 7965 /* If this key is already expired skip it */
7966 if (expiretime < now) continue;
7967 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7968 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7969 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7970 }
b9bc0eef 7971 if (swapped) decrRefCount(o);
9d65a1bb 7972 }
7973 dictReleaseIterator(di);
7974 }
7975
7976 /* Make sure data will not remain on the OS's output buffers */
7977 fflush(fp);
7978 fsync(fileno(fp));
7979 fclose(fp);
7980
7981 /* Use RENAME to make sure the DB file is changed atomically only
7982 * if the generate DB file is ok. */
7983 if (rename(tmpfile,filename) == -1) {
7984 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7985 unlink(tmpfile);
7986 return REDIS_ERR;
7987 }
7988 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7989 return REDIS_OK;
7990
7991werr:
7992 fclose(fp);
7993 unlink(tmpfile);
e96e4fbf 7994 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 7995 if (di) dictReleaseIterator(di);
7996 return REDIS_ERR;
7997}
7998
7999/* This is how rewriting of the append only file in background works:
8000 *
8001 * 1) The user calls BGREWRITEAOF
8002 * 2) Redis calls this function, that forks():
8003 * 2a) the child rewrite the append only file in a temp file.
8004 * 2b) the parent accumulates differences in server.bgrewritebuf.
8005 * 3) When the child finished '2a' exists.
8006 * 4) The parent will trap the exit code, if it's OK, will append the
8007 * data accumulated into server.bgrewritebuf into the temp file, and
8008 * finally will rename(2) the temp file in the actual file name.
8009 * The the new file is reopened as the new append only file. Profit!
8010 */
8011static int rewriteAppendOnlyFileBackground(void) {
8012 pid_t childpid;
8013
8014 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8015 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8016 if ((childpid = fork()) == 0) {
8017 /* Child */
8018 char tmpfile[256];
9d65a1bb 8019
054e426d 8020 if (server.vm_enabled) vmReopenSwapFile();
8021 close(server.fd);
9d65a1bb 8022 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8023 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8024 _exit(0);
9d65a1bb 8025 } else {
478c2c6f 8026 _exit(1);
9d65a1bb 8027 }
8028 } else {
8029 /* Parent */
8030 if (childpid == -1) {
8031 redisLog(REDIS_WARNING,
8032 "Can't rewrite append only file in background: fork: %s",
8033 strerror(errno));
8034 return REDIS_ERR;
8035 }
8036 redisLog(REDIS_NOTICE,
8037 "Background append only file rewriting started by pid %d",childpid);
8038 server.bgrewritechildpid = childpid;
85a83172 8039 /* We set appendseldb to -1 in order to force the next call to the
8040 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8041 * accumulated by the parent into server.bgrewritebuf will start
8042 * with a SELECT statement and it will be safe to merge. */
8043 server.appendseldb = -1;
9d65a1bb 8044 return REDIS_OK;
8045 }
8046 return REDIS_OK; /* unreached */
8047}
8048
8049static void bgrewriteaofCommand(redisClient *c) {
8050 if (server.bgrewritechildpid != -1) {
8051 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8052 return;
8053 }
8054 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8055 char *status = "+Background append only file rewriting started\r\n";
8056 addReplySds(c,sdsnew(status));
9d65a1bb 8057 } else {
8058 addReply(c,shared.err);
8059 }
8060}
8061
/* Unlink the temp file named after 'childpid' that a background append
 * only file rewrite writes to. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int) childpid);
    unlink(path);
}
8068
996cb5f7 8069/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
8071 * - Threaded Virtual Memory I/O
8072 * The two parts are not fully decoupled, but functions are split among two
8073 * different sections of the source code (delimited by comments) in order to
8074 * make more clear what functionality is about the blocking VM and what about
8075 * the threaded (not blocking) VM.
8076 *
8077 * Redis VM design:
8078 *
8079 * Redis VM is a blocking VM (one that blocks reading swapped values from
8080 * disk into memory when a value swapped out is needed in memory) that is made
8081 * unblocking by trying to examine the command argument vector in order to
8082 * load in background values that will likely be needed in order to exec
8083 * the command. The command is executed only once all the relevant keys
8084 * are loaded into memory.
8085 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
8088 */
8089
8090/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8091
8092/* substitute the first occurrence of '%p' with the process pid in the
8093 * swap file name. */
8094static void expandVmSwapFilename(void) {
8095 char *p = strstr(server.vm_swap_file,"%p");
8096 sds new;
8097
8098 if (!p) return;
8099 new = sdsempty();
8100 *p = '\0';
8101 new = sdscat(new,server.vm_swap_file);
8102 new = sdscatprintf(new,"%ld",(long) getpid());
8103 new = sdscat(new,p+2);
8104 zfree(server.vm_swap_file);
8105 server.vm_swap_file = new;
8106}
8107
75680a3c 8108static void vmInit(void) {
8109 off_t totsize;
996cb5f7 8110 int pipefds[2];
bcaa7a4f 8111 size_t stacksize;
75680a3c 8112
4ad37480 8113 if (server.vm_max_threads != 0)
8114 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8115
054e426d 8116 expandVmSwapFilename();
8117 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8118 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8119 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8120 }
75680a3c 8121 if (server.vm_fp == NULL) {
6fa987e3 8122 redisLog(REDIS_WARNING,
8123 "Impossible to open the swap file: %s. Exiting.",
8124 strerror(errno));
75680a3c 8125 exit(1);
8126 }
8127 server.vm_fd = fileno(server.vm_fp);
8128 server.vm_next_page = 0;
8129 server.vm_near_pages = 0;
7d98e08c 8130 server.vm_stats_used_pages = 0;
8131 server.vm_stats_swapped_objects = 0;
8132 server.vm_stats_swapouts = 0;
8133 server.vm_stats_swapins = 0;
75680a3c 8134 totsize = server.vm_pages*server.vm_page_size;
8135 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8136 if (ftruncate(server.vm_fd,totsize) == -1) {
8137 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8138 strerror(errno));
8139 exit(1);
8140 } else {
8141 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8142 }
7d30035d 8143 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8144 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8145 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8146 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8147
996cb5f7 8148 /* Initialize threaded I/O (used by Virtual Memory) */
8149 server.io_newjobs = listCreate();
8150 server.io_processing = listCreate();
8151 server.io_processed = listCreate();
d5d55fc3 8152 server.io_ready_clients = listCreate();
92f8e882 8153 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8154 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8155 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8156 server.io_active_threads = 0;
996cb5f7 8157 if (pipe(pipefds) == -1) {
8158 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8159 ,strerror(errno));
8160 exit(1);
8161 }
8162 server.io_ready_pipe_read = pipefds[0];
8163 server.io_ready_pipe_write = pipefds[1];
8164 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8165 /* LZF requires a lot of stack */
8166 pthread_attr_init(&server.io_threads_attr);
8167 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8168 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8169 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8170 /* Listen for events in the threaded I/O pipe */
8171 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8172 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8173 oom("creating file event");
75680a3c 8174}
8175
06224fec 8176/* Mark the page as used */
8177static void vmMarkPageUsed(off_t page) {
8178 off_t byte = page/8;
8179 int bit = page&7;
970e10bb 8180 redisAssert(vmFreePage(page) == 1);
06224fec 8181 server.vm_bitmap[byte] |= 1<<bit;
8182}
8183
8184/* Mark N contiguous pages as used, with 'page' being the first. */
8185static void vmMarkPagesUsed(off_t page, off_t count) {
8186 off_t j;
8187
8188 for (j = 0; j < count; j++)
7d30035d 8189 vmMarkPageUsed(page+j);
7d98e08c 8190 server.vm_stats_used_pages += count;
7c775e09 8191 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8192 (long long)count, (long long)page);
06224fec 8193}
8194
8195/* Mark the page as free */
8196static void vmMarkPageFree(off_t page) {
8197 off_t byte = page/8;
8198 int bit = page&7;
970e10bb 8199 redisAssert(vmFreePage(page) == 0);
06224fec 8200 server.vm_bitmap[byte] &= ~(1<<bit);
8201}
8202
8203/* Mark N contiguous pages as free, with 'page' being the first. */
8204static void vmMarkPagesFree(off_t page, off_t count) {
8205 off_t j;
8206
8207 for (j = 0; j < count; j++)
7d30035d 8208 vmMarkPageFree(page+j);
7d98e08c 8209 server.vm_stats_used_pages -= count;
7c775e09 8210 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8211 (long long)count, (long long)page);
06224fec 8212}
8213
8214/* Test if the page is free */
8215static int vmFreePage(off_t page) {
8216 off_t byte = page/8;
8217 int bit = page&7;
7d30035d 8218 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8219}
8220
8221/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 8222 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8223 * REDIS_ERR is returned.
06224fec 8224 *
8225 * This function uses a simple algorithm: we try to allocate
8226 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8227 * again from the start of the swap file searching for free spaces.
8228 *
8229 * If it looks pretty clear that there are no free pages near our offset
8230 * we try to find less populated places doing a forward jump of
8231 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8232 * without hurry, and then we jump again and so forth...
8233 *
8234 * This function can be improved using a free list to avoid to guess
8235 * too much, since we could collect data about freed pages.
8236 *
8237 * note: I implemented this function just after watching an episode of
8238 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8239 */
c7df85a4 8240static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8241 off_t base, offset = 0, since_jump = 0, numfree = 0;
8242
8243 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8244 server.vm_near_pages = 0;
8245 server.vm_next_page = 0;
8246 }
8247 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8248 base = server.vm_next_page;
8249
8250 while(offset < server.vm_pages) {
8251 off_t this = base+offset;
8252
8253 /* If we overflow, restart from page zero */
8254 if (this >= server.vm_pages) {
8255 this -= server.vm_pages;
8256 if (this == 0) {
8257 /* Just overflowed, what we found on tail is no longer
8258 * interesting, as it's no longer contiguous. */
8259 numfree = 0;
8260 }
8261 }
8262 if (vmFreePage(this)) {
8263 /* This is a free page */
8264 numfree++;
8265 /* Already got N free pages? Return to the caller, with success */
8266 if (numfree == n) {
7d30035d 8267 *first = this-(n-1);
8268 server.vm_next_page = this+1;
7c775e09 8269 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8270 return REDIS_OK;
06224fec 8271 }
8272 } else {
8273 /* The current one is not a free page */
8274 numfree = 0;
8275 }
8276
8277 /* Fast-forward if the current page is not free and we already
8278 * searched enough near this place. */
8279 since_jump++;
8280 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8281 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8282 since_jump = 0;
8283 /* Note that even if we rewind after the jump, we are don't need
8284 * to make sure numfree is set to zero as we only jump *if* it
8285 * is set to zero. */
8286 } else {
8287 /* Otherwise just check the next page */
8288 offset++;
8289 }
8290 }
3a66edc7 8291 return REDIS_ERR;
8292}
8293
a5819310 8294/* Write the specified object at the specified page of the swap file */
8295static int vmWriteObjectOnSwap(robj *o, off_t page) {
8296 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8297 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8298 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8299 redisLog(REDIS_WARNING,
9ebed7cf 8300 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8301 strerror(errno));
8302 return REDIS_ERR;
8303 }
8304 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8305 fflush(server.vm_fp);
a5819310 8306 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8307 return REDIS_OK;
8308}
8309
3a66edc7 8310/* Swap the 'val' object relative to 'key' into disk. Store all the information
8311 * needed to later retrieve the object into the key object.
8312 * If we can't find enough contiguous empty pages to swap the object on disk
8313 * REDIS_ERR is returned. */
a69a0c9c 8314static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8315 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8316 off_t page;
8317
8318 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8319 assert(key->refcount == 1);
3a66edc7 8320 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8321 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8322 key->vm.page = page;
8323 key->vm.usedpages = pages;
8324 key->storage = REDIS_VM_SWAPPED;
d894161b 8325 key->vtype = val->type;
3a66edc7 8326 decrRefCount(val); /* Deallocate the object from memory. */
8327 vmMarkPagesUsed(page,pages);
7d30035d 8328 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8329 (unsigned char*) key->ptr,
8330 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8331 server.vm_stats_swapped_objects++;
8332 server.vm_stats_swapouts++;
3a66edc7 8333 return REDIS_OK;
8334}
8335
a5819310 8336static robj *vmReadObjectFromSwap(off_t page, int type) {
8337 robj *o;
3a66edc7 8338
a5819310 8339 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8340 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8341 redisLog(REDIS_WARNING,
d5d55fc3 8342 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8343 strerror(errno));
478c2c6f 8344 _exit(1);
3a66edc7 8345 }
a5819310 8346 o = rdbLoadObject(type,server.vm_fp);
8347 if (o == NULL) {
d5d55fc3 8348 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8349 _exit(1);
3a66edc7 8350 }
a5819310 8351 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8352 return o;
8353}
8354
8355/* Load the value object relative to the 'key' object from swap to memory.
8356 * The newly allocated object is returned.
8357 *
8358 * If preview is true the unserialized object is returned to the caller but
8359 * no changes are made to the key object, nor the pages are marked as freed */
8360static robj *vmGenericLoadObject(robj *key, int preview) {
8361 robj *val;
8362
d5d55fc3 8363 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8364 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8365 if (!preview) {
8366 key->storage = REDIS_VM_MEMORY;
8367 key->vm.atime = server.unixtime;
8368 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8369 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8370 (unsigned char*) key->ptr);
7d98e08c 8371 server.vm_stats_swapped_objects--;
38aba9a1 8372 } else {
8373 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8374 (unsigned char*) key->ptr);
7e69548d 8375 }
7d98e08c 8376 server.vm_stats_swapins++;
3a66edc7 8377 return val;
06224fec 8378}
8379
7e69548d 8380/* Plain object loading, from swap to memory */
8381static robj *vmLoadObject(robj *key) {
996cb5f7 8382 /* If we are loading the object in background, stop it, we
8383 * need to load this object synchronously ASAP. */
8384 if (key->storage == REDIS_VM_LOADING)
8385 vmCancelThreadedIOJob(key);
7e69548d 8386 return vmGenericLoadObject(key,0);
8387}
8388
8389/* Just load the value on disk, without to modify the key.
8390 * This is useful when we want to perform some operation on the value
8391 * without to really bring it from swap to memory, like while saving the
8392 * dataset or rewriting the append only log. */
8393static robj *vmPreviewObject(robj *key) {
8394 return vmGenericLoadObject(key,1);
8395}
8396
4ef8de8a 8397/* How a good candidate is this object for swapping?
8398 * The better candidate it is, the greater the returned value.
8399 *
8400 * Currently we try to perform a fast estimation of the object size in
8401 * memory, and combine it with aging informations.
8402 *
8403 * Basically swappability = idle-time * log(estimated size)
8404 *
8405 * Bigger objects are preferred over smaller objects, but not
8406 * proportionally, this is why we use the logarithm. This algorithm is
8407 * just a first try and will probably be tuned later. */
8408static double computeObjectSwappability(robj *o) {
8409 time_t age = server.unixtime - o->vm.atime;
8410 long asize = 0;
8411 list *l;
8412 dict *d;
8413 struct dictEntry *de;
8414 int z;
8415
8416 if (age <= 0) return 0;
8417 switch(o->type) {
8418 case REDIS_STRING:
8419 if (o->encoding != REDIS_ENCODING_RAW) {
8420 asize = sizeof(*o);
8421 } else {
8422 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8423 }
8424 break;
8425 case REDIS_LIST:
8426 l = o->ptr;
8427 listNode *ln = listFirst(l);
8428
8429 asize = sizeof(list);
8430 if (ln) {
8431 robj *ele = ln->value;
8432 long elesize;
8433
8434 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8435 (sizeof(*o)+sdslen(ele->ptr)) :
8436 sizeof(*o);
8437 asize += (sizeof(listNode)+elesize)*listLength(l);
8438 }
8439 break;
8440 case REDIS_SET:
8441 case REDIS_ZSET:
8442 z = (o->type == REDIS_ZSET);
8443 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8444
8445 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8446 if (z) asize += sizeof(zset)-sizeof(dict);
8447 if (dictSize(d)) {
8448 long elesize;
8449 robj *ele;
8450
8451 de = dictGetRandomKey(d);
8452 ele = dictGetEntryKey(de);
8453 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8454 (sizeof(*o)+sdslen(ele->ptr)) :
8455 sizeof(*o);
8456 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8457 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8458 }
8459 break;
a97b9060 8460 case REDIS_HASH:
8461 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8462 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8463 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8464 unsigned int klen, vlen;
8465 unsigned char *key, *val;
8466
8467 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8468 klen = 0;
8469 vlen = 0;
8470 }
8471 asize = len*(klen+vlen+3);
8472 } else if (o->encoding == REDIS_ENCODING_HT) {
8473 d = o->ptr;
8474 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8475 if (dictSize(d)) {
8476 long elesize;
8477 robj *ele;
8478
8479 de = dictGetRandomKey(d);
8480 ele = dictGetEntryKey(de);
8481 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8482 (sizeof(*o)+sdslen(ele->ptr)) :
8483 sizeof(*o);
8484 ele = dictGetEntryVal(de);
8485 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8486 (sizeof(*o)+sdslen(ele->ptr)) :
8487 sizeof(*o);
8488 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8489 }
8490 }
8491 break;
4ef8de8a 8492 }
c8c72447 8493 return (double)age*log(1+asize);
4ef8de8a 8494}
8495
8496/* Try to swap an object that's a good candidate for swapping.
8497 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8498 * to swap any object at all.
8499 *
8500 * If 'usethreaded' is true, Redis will try to swap the object in background
8501 * using I/O threads. */
8502static int vmSwapOneObject(int usethreads) {
4ef8de8a 8503 int j, i;
8504 struct dictEntry *best = NULL;
8505 double best_swappability = 0;
b9bc0eef 8506 redisDb *best_db = NULL;
4ef8de8a 8507 robj *key, *val;
8508
8509 for (j = 0; j < server.dbnum; j++) {
8510 redisDb *db = server.db+j;
b72f6a4b 8511 /* Why maxtries is set to 100?
8512 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8513 * are swappable objects */
b0d8747d 8514 int maxtries = 100;
4ef8de8a 8515
8516 if (dictSize(db->dict) == 0) continue;
8517 for (i = 0; i < 5; i++) {
8518 dictEntry *de;
8519 double swappability;
8520
e3cadb8a 8521 if (maxtries) maxtries--;
4ef8de8a 8522 de = dictGetRandomKey(db->dict);
8523 key = dictGetEntryKey(de);
8524 val = dictGetEntryVal(de);
1064ef87 8525 /* Only swap objects that are currently in memory.
8526 *
8527 * Also don't swap shared objects if threaded VM is on, as we
8528 * try to ensure that the main thread does not touch the
8529 * object while the I/O thread is using it, but we can't
8530 * control other keys without adding additional mutex. */
8531 if (key->storage != REDIS_VM_MEMORY ||
8532 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8533 if (maxtries) i--; /* don't count this try */
8534 continue;
8535 }
4ef8de8a 8536 swappability = computeObjectSwappability(val);
8537 if (!best || swappability > best_swappability) {
8538 best = de;
8539 best_swappability = swappability;
b9bc0eef 8540 best_db = db;
4ef8de8a 8541 }
8542 }
8543 }
7c775e09 8544 if (best == NULL) return REDIS_ERR;
4ef8de8a 8545 key = dictGetEntryKey(best);
8546 val = dictGetEntryVal(best);
8547
e3cadb8a 8548 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8549 key->ptr, best_swappability);
8550
8551 /* Unshare the key if needed */
8552 if (key->refcount > 1) {
8553 robj *newkey = dupStringObject(key);
8554 decrRefCount(key);
8555 key = dictGetEntryKey(best) = newkey;
8556 }
8557 /* Swap it */
a69a0c9c 8558 if (usethreads) {
b9bc0eef 8559 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8560 return REDIS_OK;
8561 } else {
a69a0c9c 8562 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8563 dictGetEntryVal(best) = NULL;
8564 return REDIS_OK;
8565 } else {
8566 return REDIS_ERR;
8567 }
4ef8de8a 8568 }
8569}
8570
/* Swap out one object without using I/O threads.
 * The return value is the one of vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8574
/* Swap out one object in background using I/O threads.
 * The return value is the one of vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8578
7e69548d 8579/* Return true if it's safe to swap out objects in a given moment.
8580 * Basically we don't want to swap objects out while there is a BGSAVE
8581 * or a BGAEOREWRITE running in backgroud. */
8582static int vmCanSwapOut(void) {
8583 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8584}
8585
1b03836c 8586/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8587 * and was deleted. Otherwise 0 is returned. */
8588static int deleteIfSwapped(redisDb *db, robj *key) {
8589 dictEntry *de;
8590 robj *foundkey;
8591
8592 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8593 foundkey = dictGetEntryKey(de);
8594 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8595 deleteKey(db,key);
8596 return 1;
8597}
8598
996cb5f7 8599/* =================== Virtual Memory - Threaded I/O ======================= */
8600
b9bc0eef 8601static void freeIOJob(iojob *j) {
d5d55fc3 8602 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8603 j->type == REDIS_IOJOB_DO_SWAP ||
8604 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8605 decrRefCount(j->val);
8606 decrRefCount(j->key);
8607 zfree(j);
8608}
8609
996cb5f7 8610/* Every time a thread finished a Job, it writes a byte into the write side
8611 * of an unix pipe in order to "awake" the main thread, and this function
8612 * is called. */
8613static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8614 int mask)
8615{
8616 char buf[1];
b0d8747d 8617 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8618 REDIS_NOTUSED(el);
8619 REDIS_NOTUSED(mask);
8620 REDIS_NOTUSED(privdata);
8621
8622 /* For every byte we read in the read side of the pipe, there is one
8623 * I/O job completed to process. */
8624 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8625 iojob *j;
8626 listNode *ln;
8627 robj *key;
8628 struct dictEntry *de;
8629
996cb5f7 8630 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8631
8632 /* Get the processed element (the oldest one) */
8633 lockThreadedIO();
1064ef87 8634 assert(listLength(server.io_processed) != 0);
f6c0bba8 8635 if (toprocess == -1) {
8636 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8637 if (toprocess <= 0) toprocess = 1;
8638 }
b9bc0eef 8639 ln = listFirst(server.io_processed);
8640 j = ln->value;
8641 listDelNode(server.io_processed,ln);
8642 unlockThreadedIO();
8643 /* If this job is marked as canceled, just ignore it */
8644 if (j->canceled) {
8645 freeIOJob(j);
8646 continue;
8647 }
8648 /* Post process it in the main thread, as there are things we
8649 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8650 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8651 de = dictFind(j->db->dict,j->key);
8652 assert(de != NULL);
8653 key = dictGetEntryKey(de);
8654 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8655 redisDb *db;
8656
b9bc0eef 8657 /* Key loaded, bring it at home */
8658 key->storage = REDIS_VM_MEMORY;
8659 key->vm.atime = server.unixtime;
8660 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8661 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8662 (unsigned char*) key->ptr);
8663 server.vm_stats_swapped_objects--;
8664 server.vm_stats_swapins++;
d5d55fc3 8665 dictGetEntryVal(de) = j->val;
8666 incrRefCount(j->val);
8667 db = j->db;
b9bc0eef 8668 freeIOJob(j);
d5d55fc3 8669 /* Handle clients waiting for this key to be loaded. */
8670 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8671 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8672 /* Now we know the amount of pages required to swap this object.
8673 * Let's find some space for it, and queue this task again
8674 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8675 if (!vmCanSwapOut() ||
8676 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8677 {
8678 /* Ooops... no space or we can't swap as there is
8679 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8680 freeIOJob(j);
054e426d 8681 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8682 } else {
c7df85a4 8683 /* Note that we need to mark this pages as used now,
8684 * if the job will be canceled, we'll mark them as freed
8685 * again. */
8686 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8687 j->type = REDIS_IOJOB_DO_SWAP;
8688 lockThreadedIO();
8689 queueIOJob(j);
8690 unlockThreadedIO();
8691 }
8692 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8693 robj *val;
8694
8695 /* Key swapped. We can finally free some memory. */
6c96ba7d 8696 if (key->storage != REDIS_VM_SWAPPING) {
8697 printf("key->storage: %d\n",key->storage);
8698 printf("key->name: %s\n",(char*)key->ptr);
8699 printf("key->refcount: %d\n",key->refcount);
8700 printf("val: %p\n",(void*)j->val);
8701 printf("val->type: %d\n",j->val->type);
8702 printf("val->ptr: %s\n",(char*)j->val->ptr);
8703 }
8704 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8705 val = dictGetEntryVal(de);
8706 key->vm.page = j->page;
8707 key->vm.usedpages = j->pages;
8708 key->storage = REDIS_VM_SWAPPED;
8709 key->vtype = j->val->type;
8710 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8711 dictGetEntryVal(de) = NULL;
b9bc0eef 8712 redisLog(REDIS_DEBUG,
8713 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8714 (unsigned char*) key->ptr,
8715 (unsigned long long) j->page, (unsigned long long) j->pages);
8716 server.vm_stats_swapped_objects++;
8717 server.vm_stats_swapouts++;
8718 freeIOJob(j);
f11b8647 8719 /* Put a few more swap requests in queue if we are still
8720 * out of memory */
b0d8747d 8721 if (trytoswap && vmCanSwapOut() &&
8722 zmalloc_used_memory() > server.vm_max_memory)
8723 {
f11b8647 8724 int more = 1;
8725 while(more) {
8726 lockThreadedIO();
8727 more = listLength(server.io_newjobs) <
8728 (unsigned) server.vm_max_threads;
8729 unlockThreadedIO();
8730 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8731 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8732 trytoswap = 0;
8733 break;
8734 }
f11b8647 8735 }
8736 }
b9bc0eef 8737 }
c953f24b 8738 processed++;
f6c0bba8 8739 if (processed == toprocess) return;
996cb5f7 8740 }
8741 if (retval < 0 && errno != EAGAIN) {
8742 redisLog(REDIS_WARNING,
8743 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8744 strerror(errno));
8745 }
8746}
8747
8748static void lockThreadedIO(void) {
8749 pthread_mutex_lock(&server.io_mutex);
8750}
8751
8752static void unlockThreadedIO(void) {
8753 pthread_mutex_unlock(&server.io_mutex);
8754}
8755
8756/* Remove the specified object from the threaded I/O queue if still not
8757 * processed, otherwise make sure to flag it as canceled. */
8758static void vmCancelThreadedIOJob(robj *o) {
8759 list *lists[3] = {
6c96ba7d 8760 server.io_newjobs, /* 0 */
8761 server.io_processing, /* 1 */
8762 server.io_processed /* 2 */
996cb5f7 8763 };
8764 int i;
8765
8766 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 8767again:
996cb5f7 8768 lockThreadedIO();
8769 /* Search for a matching key in one of the queues */
8770 for (i = 0; i < 3; i++) {
8771 listNode *ln;
c7df85a4 8772 listIter li;
996cb5f7 8773
c7df85a4 8774 listRewind(lists[i],&li);
8775 while ((ln = listNext(&li)) != NULL) {
996cb5f7 8776 iojob *job = ln->value;
8777
6c96ba7d 8778 if (job->canceled) continue; /* Skip this, already canceled. */
996cb5f7 8779 if (compareStringObjects(job->key,o) == 0) {
970e10bb 8780 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8781 (void*)job, (char*)o->ptr, job->type, i);
427a2153 8782 /* Mark the pages as free since the swap didn't happened
8783 * or happened but is now discarded. */
970e10bb 8784 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 8785 vmMarkPagesFree(job->page,job->pages);
8786 /* Cancel the job. It depends on the list the job is
8787 * living in. */
996cb5f7 8788 switch(i) {
8789 case 0: /* io_newjobs */
6c96ba7d 8790 /* If the job was yet not processed the best thing to do
996cb5f7 8791 * is to remove it from the queue at all */
6c96ba7d 8792 freeIOJob(job);
996cb5f7 8793 listDelNode(lists[i],ln);
8794 break;
8795 case 1: /* io_processing */
d5d55fc3 8796 /* Oh Shi- the thread is messing with the Job:
8797 *
8798 * Probably it's accessing the object if this is a
8799 * PREPARE_SWAP or DO_SWAP job.
8800 * If it's a LOAD job it may be reading from disk and
8801 * if we don't wait for the job to terminate before to
8802 * cancel it, maybe in a few microseconds data can be
8803 * corrupted in this pages. So the short story is:
8804 *
8805 * Better to wait for the job to move into the
8806 * next queue (processed)... */
8807
8808 /* We try again and again until the job is completed. */
8809 unlockThreadedIO();
8810 /* But let's wait some time for the I/O thread
8811 * to finish with this job. After all this condition
8812 * should be very rare. */
8813 usleep(1);
8814 goto again;
996cb5f7 8815 case 2: /* io_processed */
2e111efe 8816 /* The job was already processed, that's easy...
8817 * just mark it as canceled so that we'll ignore it
8818 * when processing completed jobs. */
996cb5f7 8819 job->canceled = 1;
8820 break;
8821 }
c7df85a4 8822 /* Finally we have to adjust the storage type of the object
8823 * in order to "UNDO" the operaiton. */
996cb5f7 8824 if (o->storage == REDIS_VM_LOADING)
8825 o->storage = REDIS_VM_SWAPPED;
8826 else if (o->storage == REDIS_VM_SWAPPING)
8827 o->storage = REDIS_VM_MEMORY;
8828 unlockThreadedIO();
8829 return;
8830 }
8831 }
8832 }
8833 unlockThreadedIO();
8834 assert(1 != 1); /* We should never reach this */
8835}
8836
b9bc0eef 8837static void *IOThreadEntryPoint(void *arg) {
8838 iojob *j;
8839 listNode *ln;
8840 REDIS_NOTUSED(arg);
8841
8842 pthread_detach(pthread_self());
8843 while(1) {
8844 /* Get a new job to process */
8845 lockThreadedIO();
8846 if (listLength(server.io_newjobs) == 0) {
8847 /* No new jobs in queue, exit. */
9ebed7cf 8848 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8849 (long) pthread_self());
b9bc0eef 8850 server.io_active_threads--;
8851 unlockThreadedIO();
8852 return NULL;
8853 }
8854 ln = listFirst(server.io_newjobs);
8855 j = ln->value;
8856 listDelNode(server.io_newjobs,ln);
8857 /* Add the job in the processing queue */
8858 j->thread = pthread_self();
8859 listAddNodeTail(server.io_processing,j);
8860 ln = listLast(server.io_processing); /* We use ln later to remove it */
8861 unlockThreadedIO();
9ebed7cf 8862 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8863 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 8864
8865 /* Process the Job */
8866 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8867 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 8868 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8869 FILE *fp = fopen("/dev/null","w+");
8870 j->pages = rdbSavedObjectPages(j->val,fp);
8871 fclose(fp);
8872 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 8873 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8874 j->canceled = 1;
b9bc0eef 8875 }
8876
8877 /* Done: insert the job into the processed queue */
9ebed7cf 8878 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8879 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 8880 lockThreadedIO();
8881 listDelNode(server.io_processing,ln);
8882 listAddNodeTail(server.io_processed,j);
8883 unlockThreadedIO();
8884
8885 /* Signal the main thread there is new stuff to process */
8886 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8887 }
8888 return NULL; /* never reached */
8889}
8890
8891static void spawnIOThread(void) {
8892 pthread_t thread;
478c2c6f 8893 sigset_t mask, omask;
a97b9060 8894 int err;
b9bc0eef 8895
478c2c6f 8896 sigemptyset(&mask);
8897 sigaddset(&mask,SIGCHLD);
8898 sigaddset(&mask,SIGHUP);
8899 sigaddset(&mask,SIGPIPE);
8900 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 8901 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8902 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8903 strerror(err));
8904 usleep(1000000);
8905 }
478c2c6f 8906 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 8907 server.io_active_threads++;
8908}
8909
4ee9488d 8910/* We need to wait for the last thread to exit before we are able to
8911 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 8912static void waitEmptyIOJobsQueue(void) {
4ee9488d 8913 while(1) {
76b7233a 8914 int io_processed_len;
8915
4ee9488d 8916 lockThreadedIO();
054e426d 8917 if (listLength(server.io_newjobs) == 0 &&
8918 listLength(server.io_processing) == 0 &&
8919 server.io_active_threads == 0)
8920 {
4ee9488d 8921 unlockThreadedIO();
8922 return;
8923 }
76b7233a 8924 /* While waiting for empty jobs queue condition we post-process some
8925 * finshed job, as I/O threads may be hanging trying to write against
8926 * the io_ready_pipe_write FD but there are so much pending jobs that
8927 * it's blocking. */
8928 io_processed_len = listLength(server.io_processed);
4ee9488d 8929 unlockThreadedIO();
76b7233a 8930 if (io_processed_len) {
8931 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8932 usleep(1000); /* 1 millisecond */
8933 } else {
8934 usleep(10000); /* 10 milliseconds */
8935 }
4ee9488d 8936 }
8937}
8938
054e426d 8939static void vmReopenSwapFile(void) {
478c2c6f 8940 /* Note: we don't close the old one as we are in the child process
8941 * and don't want to mess at all with the original file object. */
054e426d 8942 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8943 if (server.vm_fp == NULL) {
8944 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8945 server.vm_swap_file);
478c2c6f 8946 _exit(1);
054e426d 8947 }
8948 server.vm_fd = fileno(server.vm_fp);
8949}
8950
b9bc0eef 8951/* This function must be called while with threaded IO locked */
8952static void queueIOJob(iojob *j) {
6c96ba7d 8953 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8954 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 8955 listAddNodeTail(server.io_newjobs,j);
8956 if (server.io_active_threads < server.vm_max_threads)
8957 spawnIOThread();
8958}
8959
8960static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8961 iojob *j;
8962
8963 assert(key->storage == REDIS_VM_MEMORY);
8964 assert(key->refcount == 1);
8965
8966 j = zmalloc(sizeof(*j));
8967 j->type = REDIS_IOJOB_PREPARE_SWAP;
8968 j->db = db;
8969 j->key = dupStringObject(key);
8970 j->val = val;
8971 incrRefCount(val);
8972 j->canceled = 0;
8973 j->thread = (pthread_t) -1;
f11b8647 8974 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 8975
8976 lockThreadedIO();
8977 queueIOJob(j);
8978 unlockThreadedIO();
8979 return REDIS_OK;
8980}
8981
b0d8747d 8982/* ============ Virtual Memory - Blocking clients on missing keys =========== */
8983
d5d55fc3 8984/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8985 * If there is not already a job loading the key, it is craeted.
8986 * The key is added to the io_keys list in the client structure, and also
8987 * in the hash table mapping swapped keys to waiting clients, that is,
8988 * server.io_waited_keys. */
8989static int waitForSwappedKey(redisClient *c, robj *key) {
8990 struct dictEntry *de;
8991 robj *o;
8992 list *l;
8993
8994 /* If the key does not exist or is already in RAM we don't need to
8995 * block the client at all. */
8996 de = dictFind(c->db->dict,key);
8997 if (de == NULL) return 0;
8998 o = dictGetEntryKey(de);
8999 if (o->storage == REDIS_VM_MEMORY) {
9000 return 0;
9001 } else if (o->storage == REDIS_VM_SWAPPING) {
9002 /* We were swapping the key, undo it! */
9003 vmCancelThreadedIOJob(o);
9004 return 0;
9005 }
9006
9007 /* OK: the key is either swapped, or being loaded just now. */
9008
9009 /* Add the key to the list of keys this client is waiting for.
9010 * This maps clients to keys they are waiting for. */
9011 listAddNodeTail(c->io_keys,key);
9012 incrRefCount(key);
9013
9014 /* Add the client to the swapped keys => clients waiting map. */
9015 de = dictFind(c->db->io_keys,key);
9016 if (de == NULL) {
9017 int retval;
9018
9019 /* For every key we take a list of clients blocked for it */
9020 l = listCreate();
9021 retval = dictAdd(c->db->io_keys,key,l);
9022 incrRefCount(key);
9023 assert(retval == DICT_OK);
9024 } else {
9025 l = dictGetEntryVal(de);
9026 }
9027 listAddNodeTail(l,c);
9028
9029 /* Are we already loading the key from disk? If not create a job */
9030 if (o->storage == REDIS_VM_SWAPPED) {
9031 iojob *j;
9032
9033 o->storage = REDIS_VM_LOADING;
9034 j = zmalloc(sizeof(*j));
9035 j->type = REDIS_IOJOB_LOAD;
9036 j->db = c->db;
9037 j->key = dupStringObject(key);
9038 j->key->vtype = o->vtype;
9039 j->page = o->vm.page;
9040 j->val = NULL;
9041 j->canceled = 0;
9042 j->thread = (pthread_t) -1;
9043 lockThreadedIO();
9044 queueIOJob(j);
9045 unlockThreadedIO();
9046 }
9047 return 1;
9048}
9049
76583ea4
PN
9050/* Preload keys needed for the ZUNION and ZINTER commands. */
9051static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9052 int i, num;
9053 num = atoi(c->argv[2]->ptr);
9054 for (i = 0; i < num; i++) {
9055 waitForSwappedKey(c,c->argv[3+i]);
9056 }
9057}
9058
b0d8747d 9059/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9060 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9061 *
d5d55fc3 9062 * The important idea about this function is that it can fail! If keys will
9063 * still be swapped when the client is resumed, this key lookups will
9064 * just block loading keys from disk. In practical terms this should only
9065 * happen with SORT BY command or if there is a bug in this function.
9066 *
9067 * Return 1 if the client is marked as blocked, 0 if the client can
9068 * continue as the keys it is going to access appear to be in memory. */
9069static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9070 int j, last;
9071
76583ea4
PN
9072 if (cmd->vm_preload_proc != NULL) {
9073 cmd->vm_preload_proc(c);
9074 } else {
9075 if (cmd->vm_firstkey == 0) return 0;
9076 last = cmd->vm_lastkey;
9077 if (last < 0) last = c->argc+last;
9078 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9079 waitForSwappedKey(c,c->argv[j]);
9080 }
9081
d5d55fc3 9082 /* If the client was blocked for at least one key, mark it as blocked. */
9083 if (listLength(c->io_keys)) {
9084 c->flags |= REDIS_IO_WAIT;
9085 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9086 server.vm_blocked_clients++;
9087 return 1;
9088 } else {
9089 return 0;
9090 }
9091}
9092
9093/* Remove the 'key' from the list of blocked keys for a given client.
9094 *
9095 * The function returns 1 when there are no longer blocking keys after
9096 * the current one was removed (and the client can be unblocked). */
9097static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9098 list *l;
9099 listNode *ln;
9100 listIter li;
9101 struct dictEntry *de;
9102
9103 /* Remove the key from the list of keys this client is waiting for. */
9104 listRewind(c->io_keys,&li);
9105 while ((ln = listNext(&li)) != NULL) {
9106 if (compareStringObjects(ln->value,key) == 0) {
9107 listDelNode(c->io_keys,ln);
9108 break;
9109 }
9110 }
9111 assert(ln != NULL);
9112
9113 /* Remove the client form the key => waiting clients map. */
9114 de = dictFind(c->db->io_keys,key);
9115 assert(de != NULL);
9116 l = dictGetEntryVal(de);
9117 ln = listSearchKey(l,c);
9118 assert(ln != NULL);
9119 listDelNode(l,ln);
9120 if (listLength(l) == 0)
9121 dictDelete(c->db->io_keys,key);
9122
9123 return listLength(c->io_keys) == 0;
9124}
9125
9126static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9127 struct dictEntry *de;
9128 list *l;
9129 listNode *ln;
9130 int len;
9131
9132 de = dictFind(db->io_keys,key);
9133 if (!de) return;
9134
9135 l = dictGetEntryVal(de);
9136 len = listLength(l);
9137 /* Note: we can't use something like while(listLength(l)) as the list
9138 * can be freed by the calling function when we remove the last element. */
9139 while (len--) {
9140 ln = listFirst(l);
9141 redisClient *c = ln->value;
9142
9143 if (dontWaitForSwappedKey(c,key)) {
9144 /* Put the client in the list of clients ready to go as we
9145 * loaded all the keys about it. */
9146 listAddNodeTail(server.io_ready_clients,c);
9147 }
9148 }
b0d8747d 9149}
b0d8747d 9150
500ece7c 9151/* =========================== Remote Configuration ========================= */
9152
9153static void configSetCommand(redisClient *c) {
9154 robj *o = getDecodedObject(c->argv[3]);
9155 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9156 zfree(server.dbfilename);
9157 server.dbfilename = zstrdup(o->ptr);
9158 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9159 zfree(server.requirepass);
9160 server.requirepass = zstrdup(o->ptr);
9161 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9162 zfree(server.masterauth);
9163 server.masterauth = zstrdup(o->ptr);
9164 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9165 server.maxmemory = strtoll(o->ptr, NULL, 10);
9166 } else {
9167 addReplySds(c,sdscatprintf(sdsempty(),
9168 "-ERR not supported CONFIG parameter %s\r\n",
9169 (char*)c->argv[2]->ptr));
9170 decrRefCount(o);
9171 return;
9172 }
9173 decrRefCount(o);
9174 addReply(c,shared.ok);
9175}
9176
9177static void configGetCommand(redisClient *c) {
9178 robj *o = getDecodedObject(c->argv[2]);
9179 robj *lenobj = createObject(REDIS_STRING,NULL);
9180 char *pattern = o->ptr;
9181 int matches = 0;
9182
9183 addReply(c,lenobj);
9184 decrRefCount(lenobj);
9185
9186 if (stringmatch(pattern,"dbfilename",0)) {
9187 addReplyBulkCString(c,"dbfilename");
9188 addReplyBulkCString(c,server.dbfilename);
9189 matches++;
9190 }
9191 if (stringmatch(pattern,"requirepass",0)) {
9192 addReplyBulkCString(c,"requirepass");
9193 addReplyBulkCString(c,server.requirepass);
9194 matches++;
9195 }
9196 if (stringmatch(pattern,"masterauth",0)) {
9197 addReplyBulkCString(c,"masterauth");
9198 addReplyBulkCString(c,server.masterauth);
9199 matches++;
9200 }
9201 if (stringmatch(pattern,"maxmemory",0)) {
9202 char buf[128];
9203
9204 snprintf(buf,128,"%llu\n",server.maxmemory);
9205 addReplyBulkCString(c,"maxmemory");
9206 addReplyBulkCString(c,buf);
9207 matches++;
9208 }
9209 decrRefCount(o);
9210 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9211}
9212
9213static void configCommand(redisClient *c) {
9214 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9215 if (c->argc != 4) goto badarity;
9216 configSetCommand(c);
9217 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9218 if (c->argc != 3) goto badarity;
9219 configGetCommand(c);
9220 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9221 if (c->argc != 2) goto badarity;
9222 server.stat_numcommands = 0;
9223 server.stat_numconnections = 0;
9224 server.stat_expiredkeys = 0;
9225 server.stat_starttime = time(NULL);
9226 addReply(c,shared.ok);
9227 } else {
9228 addReplySds(c,sdscatprintf(sdsempty(),
9229 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9230 }
9231 return;
9232
9233badarity:
9234 addReplySds(c,sdscatprintf(sdsempty(),
9235 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9236 (char*) c->argv[1]->ptr));
9237}
9238
7f957c92 9239/* ================================= Debugging ============================== */
9240
9241static void debugCommand(redisClient *c) {
9242 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9243 *((char*)-1) = 'x';
210e29f7 9244 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9245 if (rdbSave(server.dbfilename) != REDIS_OK) {
9246 addReply(c,shared.err);
9247 return;
9248 }
9249 emptyDb();
9250 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9251 addReply(c,shared.err);
9252 return;
9253 }
9254 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9255 addReply(c,shared.ok);
71c2b467 9256 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9257 emptyDb();
9258 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9259 addReply(c,shared.err);
9260 return;
9261 }
9262 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9263 addReply(c,shared.ok);
333298da 9264 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9265 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9266 robj *key, *val;
9267
9268 if (!de) {
9269 addReply(c,shared.nokeyerr);
9270 return;
9271 }
9272 key = dictGetEntryKey(de);
9273 val = dictGetEntryVal(de);
59146ef3 9274 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9275 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9276 char *strenc;
9277 char buf[128];
9278
9279 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9280 strenc = strencoding[val->encoding];
9281 } else {
9282 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9283 strenc = buf;
9284 }
ace06542 9285 addReplySds(c,sdscatprintf(sdsempty(),
9286 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9287 "encoding:%s serializedlength:%lld\r\n",
682ac724 9288 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9289 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9290 } else {
9291 addReplySds(c,sdscatprintf(sdsempty(),
9292 "+Key at:%p refcount:%d, value swapped at: page %llu "
9293 "using %llu pages\r\n",
9294 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9295 (unsigned long long) key->vm.usedpages));
9296 }
7d30035d 9297 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9298 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9299 robj *key, *val;
9300
9301 if (!server.vm_enabled) {
9302 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9303 return;
9304 }
9305 if (!de) {
9306 addReply(c,shared.nokeyerr);
9307 return;
9308 }
9309 key = dictGetEntryKey(de);
9310 val = dictGetEntryVal(de);
4ef8de8a 9311 /* If the key is shared we want to create a copy */
9312 if (key->refcount > 1) {
9313 robj *newkey = dupStringObject(key);
9314 decrRefCount(key);
9315 key = dictGetEntryKey(de) = newkey;
9316 }
9317 /* Swap it */
7d30035d 9318 if (key->storage != REDIS_VM_MEMORY) {
9319 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9320 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9321 dictGetEntryVal(de) = NULL;
9322 addReply(c,shared.ok);
9323 } else {
9324 addReply(c,shared.err);
9325 }
7f957c92 9326 } else {
333298da 9327 addReplySds(c,sdsnew(
7d30035d 9328 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9329 }
9330}
56906eef 9331
6c96ba7d 9332static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9333 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9334 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9335#ifdef HAVE_BACKTRACE
9336 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9337 *((char*)-1) = 'x';
9338#endif
9339}
9340
bcfc686d 9341/* =================================== Main! ================================ */
56906eef 9342
bcfc686d 9343#ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its first line converted
 * with atoi(). Returns -1 when the file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;

    got = fgets(line,64,procfile);
    fclose(procfile);
    if (got == NULL) return -1;

    return atoi(line);
}
9357
9358void linuxOvercommitMemoryWarning(void) {
9359 if (linuxOvercommitMemoryValue() == 0) {
9360 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9361 }
9362}
9363#endif /* __linux__ */
9364
9365static void daemonize(void) {
9366 int fd;
9367 FILE *fp;
9368
9369 if (fork() != 0) exit(0); /* parent exits */
9370 setsid(); /* create a new session */
9371
9372 /* Every output goes to /dev/null. If Redis is daemonized but
9373 * the 'logfile' is set to 'stdout' in the configuration file
9374 * it will not log at all. */
9375 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9376 dup2(fd, STDIN_FILENO);
9377 dup2(fd, STDOUT_FILENO);
9378 dup2(fd, STDERR_FILENO);
9379 if (fd > STDERR_FILENO) close(fd);
9380 }
9381 /* Try to write the pid file */
9382 fp = fopen(server.pidfile,"w");
9383 if (fp) {
9384 fprintf(fp,"%d\n",getpid());
9385 fclose(fp);
56906eef 9386 }
56906eef 9387}
9388
42ab0172
AO
9389static void version() {
9390 printf("Redis server version %s\n", REDIS_VERSION);
9391 exit(0);
9392}
9393
723fb69b
AO
/* Print the command line usage on standard error and exit with status 1. */
static void usage(void) {
    fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(stderr," ./redis-server - (read config from stdin)\n");
    exit(1);
}
9399
bcfc686d 9400int main(int argc, char **argv) {
9651a787 9401 time_t start;
9402
bcfc686d 9403 initServerConfig();
9404 if (argc == 2) {
44efe66e 9405 if (strcmp(argv[1], "-v") == 0 ||
9406 strcmp(argv[1], "--version") == 0) version();
9407 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 9408 resetServerSaveParams();
9409 loadServerConfig(argv[1]);
723fb69b
AO
9410 } else if ((argc > 2)) {
9411 usage();
bcfc686d 9412 } else {
9413 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9414 }
bcfc686d 9415 if (server.daemonize) daemonize();
71c54b21 9416 initServer();
bcfc686d 9417 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9418#ifdef __linux__
9419 linuxOvercommitMemoryWarning();
9420#endif
9651a787 9421 start = time(NULL);
bcfc686d 9422 if (server.appendonly) {
9423 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9424 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9425 } else {
9426 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9427 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9428 }
bcfc686d 9429 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9430 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9431 aeMain(server.el);
9432 aeDeleteEventLoop(server.el);
9433 return 0;
9434}
9435
9436/* ============================= Backtrace support ========================= */
9437
9438#ifdef HAVE_BACKTRACE
9439static char *findFuncName(void *pointer, unsigned long *offset);
9440
56906eef 9441static void *getMcontextEip(ucontext_t *uc) {
9442#if defined(__FreeBSD__)
9443 return (void*) uc->uc_mcontext.mc_eip;
9444#elif defined(__dietlibc__)
9445 return (void*) uc->uc_mcontext.eip;
06db1f50 9446#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 9447 #if __x86_64__
9448 return (void*) uc->uc_mcontext->__ss.__rip;
9449 #else
56906eef 9450 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 9451 #endif
06db1f50 9452#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 9453 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 9454 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 9455 #else
9456 return (void*) uc->uc_mcontext->__ss.__eip;
9457 #endif
54bac49d 9458#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 9459 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 9460#elif defined(__ia64__) /* Linux IA64 */
9461 return (void*) uc->uc_mcontext.sc_ip;
9462#else
9463 return NULL;
56906eef 9464#endif
9465}
9466
9467static void segvHandler(int sig, siginfo_t *info, void *secret) {
9468 void *trace[100];
9469 char **messages = NULL;
9470 int i, trace_size = 0;
9471 unsigned long offset=0;
56906eef 9472 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9473 sds infostring;
56906eef 9474 REDIS_NOTUSED(info);
9475
9476 redisLog(REDIS_WARNING,
9477 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9478 infostring = genRedisInfoString();
9479 redisLog(REDIS_WARNING, "%s",infostring);
9480 /* It's not safe to sdsfree() the returned string under memory
9481 * corruption conditions. Let it leak as we are going to abort */
56906eef 9482
9483 trace_size = backtrace(trace, 100);
de96dbfe 9484 /* overwrite sigaction with caller's address */
b91cf5ef 9485 if (getMcontextEip(uc) != NULL) {
9486 trace[1] = getMcontextEip(uc);
9487 }
56906eef 9488 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9489
d76412d1 9490 for (i=1; i<trace_size; ++i) {
56906eef 9491 char *fn = findFuncName(trace[i], &offset), *p;
9492
9493 p = strchr(messages[i],'+');
9494 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9495 redisLog(REDIS_WARNING,"%s", messages[i]);
9496 } else {
9497 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9498 }
9499 }
b177fd30 9500 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 9501 _exit(0);
fe3bbfbe 9502}
56906eef 9503
9504static void setupSigSegvAction(void) {
9505 struct sigaction act;
9506
9507 sigemptyset (&act.sa_mask);
9508 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9509 * is used. Otherwise, sa_handler is used */
9510 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9511 act.sa_sigaction = segvHandler;
9512 sigaction (SIGSEGV, &act, NULL);
9513 sigaction (SIGBUS, &act, NULL);
12fea928 9514 sigaction (SIGFPE, &act, NULL);
9515 sigaction (SIGILL, &act, NULL);
9516 sigaction (SIGBUS, &act, NULL);
e65fdc78 9517 return;
56906eef 9518}
e65fdc78 9519
bcfc686d 9520#include "staticsymbols.h"
9521/* This function try to convert a pointer into a function name. It's used in
9522 * oreder to provide a backtrace under segmentation fault that's able to
9523 * display functions declared as static (otherwise the backtrace is useless). */
9524static char *findFuncName(void *pointer, unsigned long *offset){
9525 int i, ret = -1;
9526 unsigned long off, minoff = 0;
ed9b544e 9527
bcfc686d 9528 /* Try to match against the Symbol with the smallest offset */
9529 for (i=0; symsTable[i].pointer; i++) {
9530 unsigned long lp = (unsigned long) pointer;
0bc03378 9531
bcfc686d 9532 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9533 off=lp-symsTable[i].pointer;
9534 if (ret < 0 || off < minoff) {
9535 minoff=off;
9536 ret=i;
9537 }
9538 }
0bc03378 9539 }
bcfc686d 9540 if (ret == -1) return NULL;
9541 *offset = minoff;
9542 return symsTable[ret].name;
0bc03378 9543}
bcfc686d 9544#else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Without backtrace support there is no crash handler to install. */
}
bcfc686d 9547#endif /* HAVE_BACKTRACE */
0bc03378 9548
ed9b544e 9549
ed9b544e 9550
bcfc686d 9551/* The End */
9552
9553
ed9b544e 9554