]> git.saurik.com Git - redis.git/blame - redis.c
Merge branch 'expire' of git://github.com/pietern/redis
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
c7d9d662 78#include "ziplist.h" /* Compact list data structure */
ba798261 79#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 80#include "release.h" /* Release and/or git repository information */
ed9b544e 81
82/* Error codes */
83#define REDIS_OK 0
84#define REDIS_ERR -1
85
86/* Static server configuration */
87#define REDIS_SERVERPORT 6379 /* TCP port */
88#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 89#define REDIS_IOBUF_LEN 1024
ed9b544e 90#define REDIS_LOADBUF_LEN 1024
248ea310 91#define REDIS_STATIC_ARGS 8
ed9b544e 92#define REDIS_DEFAULT_DBNUM 16
93#define REDIS_CONFIGLINE_MAX 1024
94#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 96#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 97#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 98#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
99
100/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
101#define REDIS_WRITEV_THRESHOLD 3
102/* Max number of iovecs used for each writev call */
103#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 104
105/* Hash table parameters */
106#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 107
108/* Command flags */
3fd78bcd 109#define REDIS_CMD_BULK 1 /* Bulk write command */
110#define REDIS_CMD_INLINE 2 /* Inline command */
111/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short these commands are denied on low memory conditions. */
115#define REDIS_CMD_DENYOOM 4
4005fef1 116#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 117
118/* Object types */
119#define REDIS_STRING 0
120#define REDIS_LIST 1
121#define REDIS_SET 2
1812e024 122#define REDIS_ZSET 3
123#define REDIS_HASH 4
560db612 124#define REDIS_VMPOINTER 8
f78fd11b 125
5234952b 126/* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
127 * internally represented in multiple ways. The 'encoding' field of the object
128 * is set to one of these values for this object. */
c7d9d662
PN
129#define REDIS_ENCODING_RAW 0 /* Raw representation */
130#define REDIS_ENCODING_INT 1 /* Encoded as integer */
131#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
132#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
133#define REDIS_ENCODING_LIST 4 /* Encoded as linked list */
134#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
942a3961 135
07efaf74 136static char* strencoding[] = {
846d8b3e 137 "raw", "int", "hashtable", "zipmap", "list", "ziplist"
07efaf74 138};
139
f78fd11b 140/* Object types only used for dumping to disk */
bb32ede5 141#define REDIS_EXPIRETIME 253
ed9b544e 142#define REDIS_SELECTDB 254
143#define REDIS_EOF 255
144
f78fd11b 145/* Defines related to the dump file format. To store 32 bits lengths for short
146 * keys requires a lot of space, so we check the most significant 2 bits of
147 * the first byte to interpret the length:
148 *
149 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
150 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
151 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 152 * 11|000000 this means: specially encoded object will follow. The six bits
153 * number specifies the kind of object that follows.
154 * See the REDIS_RDB_ENC_* defines.
f78fd11b 155 *
10c43610 156 * Lengths up to 63 are stored using a single byte, most DB keys, and many
157 * values, will fit inside. */
f78fd11b 158#define REDIS_RDB_6BITLEN 0
159#define REDIS_RDB_14BITLEN 1
160#define REDIS_RDB_32BITLEN 2
17be1a4a 161#define REDIS_RDB_ENCVAL 3
f78fd11b 162#define REDIS_RDB_LENERR UINT_MAX
163
a4d1ba9a 164/* When a length of a string object stored on disk has the first two bits
165 * set, the remaining six bits specify a special encoding for the object
166 * according to the following defines: */
167#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
168#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
169#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 170#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 171
75680a3c 172/* Virtual memory object->where field. */
173#define REDIS_VM_MEMORY 0 /* The object is on memory */
174#define REDIS_VM_SWAPPED 1 /* The object is on disk */
175#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
176#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
177
06224fec 178/* Virtual memory static configuration stuff.
179 * Check vmFindContiguousPages() to know more about these magic numbers. */
180#define REDIS_VM_MAX_NEAR_PAGES 65536
181#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 182#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 183#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 184/* The following is the *percentage* of completed I/O jobs to process when the
185 * handler is called. While Virtual Memory I/O operations are performed by
186 * threads, these operations must be processed by the main thread when completed
187 * in order to take effect. */
c953f24b 188#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 189
ed9b544e 190/* Client flags */
d5d55fc3 191#define REDIS_SLAVE 1 /* This client is a slave server */
192#define REDIS_MASTER 2 /* This client is a master server */
193#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
194#define REDIS_MULTI 8 /* This client is in a MULTI context */
195#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
196#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 197#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 198
40d224a9 199/* Slave replication state - slave side */
ed9b544e 200#define REDIS_REPL_NONE 0 /* No active replication */
201#define REDIS_REPL_CONNECT 1 /* Must connect to master */
202#define REDIS_REPL_CONNECTED 2 /* Connected to master */
203
40d224a9 204/* Slave replication state - from the point of view of master
205 * Note that in SEND_BULK and ONLINE state the slave receives new updates
206 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
207 * to start the next background saving in order to send updates to it. */
208#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
209#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
210#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
211#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
212
ed9b544e 213/* List related stuff */
214#define REDIS_HEAD 0
215#define REDIS_TAIL 1
216
217/* Sort operations */
218#define REDIS_SORT_GET 0
443c6409 219#define REDIS_SORT_ASC 1
220#define REDIS_SORT_DESC 2
ed9b544e 221#define REDIS_SORTKEY_MAX 1024
222
223/* Log levels */
224#define REDIS_DEBUG 0
f870935d 225#define REDIS_VERBOSE 1
226#define REDIS_NOTICE 2
227#define REDIS_WARNING 3
ed9b544e 228
229/* Anti-warning macro... */
230#define REDIS_NOTUSED(V) ((void) V)
231
6b47e12e 232#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
233#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 234
48f0308a 235/* Append only defines */
236#define APPENDFSYNC_NO 0
237#define APPENDFSYNC_ALWAYS 1
238#define APPENDFSYNC_EVERYSEC 2
239
d0686e07 240/* Zip structure related defaults */
cbba7dd7 241#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
242#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
d0686e07
PN
243#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
244#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
cbba7dd7 245
dfc5e96c 246/* We can print the stacktrace, so our assert is defined this way. Both macros call the helper declared below and then _exit(1); each expansion is fully parenthesized so it acts as a single void expression wherever it is used. */
478c2c6f 247#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
c651fd9e 248#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
6c96ba7d 249static void _redisAssert(char *estr, char *file, int line);
c651fd9e 250static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 251
ed9b544e 252/*================================= Data types ============================== */
253
254/* A redis object, that is a type able to hold a string / list / set */
75680a3c 255
75680a3c 256/* The actual Redis Object. The four bitfields add up to 32 bits (4+2+4+22). */
ed9b544e 257typedef struct redisObject {
560db612 258 unsigned type:4; /* object type: REDIS_STRING, REDIS_LIST, ... (see "Object types") */
259 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
260 unsigned encoding:4; /* one of the REDIS_ENCODING_* values */
261 unsigned lru:22; /* lru time (relative to server.lruclock) */
ed9b544e 262 int refcount; /* set to 1 by initStaticStringObject(); presumably managed by incrRefCount()/decrRefCount() */
560db612 263 void *ptr; /* payload pointer; presumably interpreted according to type/encoding */
dedff272 264 /* VM fields are only allocated if VM is active, otherwise the
75680a3c 265 * object allocation function will just allocate
266 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
267 * Redis without VM active will not have any overhead. NOTE(review): no VM fields follow in this struct, so this comment looks stale -- confirm. */
ed9b544e 268} robj;
269
560db612 270/* The VM pointer structure - identifies an object in the swap file.
271 *
272 * This object is stored in place of the value
273 * object in the main key->value hash table representing a database.
274 * Note that the first fields (type, storage) are the same as the redisObject
275 * structure so that vmPointer structures can be accessed even when cast
276 * as redisObject structures.
277 *
278 * This is useful as we don't know if a value object is or not on disk, but we
169dd6b7 279 * are always able to read obj->storage to check this. For vmPointer
560db612 280 * structures "type" is set to REDIS_VMPOINTER (even if without this field
281 * it is still possible to check the kind of object from the value of 'storage'). */
282typedef struct vmPointer {
283 unsigned type:4;
284 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
285 unsigned notused:26;
286 unsigned int vtype; /* type of the object stored in the swap file */
287 off_t page; /* the page at which the object is stored on disk */
288 off_t usedpages; /* number of pages used on disk */
289} vmpointer;
290
dfc5e96c 291/* Macro used to initialize a Redis object allocated on the stack.
292 * Note that this macro is taken near the structure definition to make sure
293 * we'll update it when the structure is changed, to avoid bugs like
294 * bug #85 introduced exactly in this way. The expansion deliberately has no trailing semicolon: the caller writes it, so the macro is a single statement and is safe inside if/else. */
295#define initStaticStringObject(_var,_ptr) do { \
296 (_var).refcount = 1; \
297 (_var).type = REDIS_STRING; \
298 (_var).encoding = REDIS_ENCODING_RAW; \
299 (_var).ptr = (_ptr); \
560db612 300 (_var).storage = REDIS_VM_MEMORY; \
dfc5e96c 301} while(0)
302
3305306f 303typedef struct redisDb {
4409877e 304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 307 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 309 int id;
310} redisDb;
311
6e469882 312/* Client MULTI/EXEC state */
313typedef struct multiCmd {
314 robj **argv;
315 int argc;
316 struct redisCommand *cmd;
317} multiCmd;
318
319typedef struct multiState {
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322} multiState;
323
ed9b544e 324/* With multiplexing we need to take per-client state.
325 * Clients are taken in a linked list. */
326typedef struct redisClient {
327 int fd;
3305306f 328 redisDb *db;
ed9b544e 329 int dictid;
330 sds querybuf;
e8a74421 331 robj **argv, **mbargv;
332 int argc, mbargc;
40d224a9 333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 334 int multibulk; /* multi bulk command format active */
ed9b544e 335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
6e469882 343 long repldboff; /* replication DB file offset */
40d224a9 344 off_t repldbsize; /* replication DB file size */
6e469882 345 multiState mstate; /* MULTI/EXEC state */
37ab76c9 346 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 347 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 348 int blocking_keys_num; /* Number of blocking keys */
4409877e 349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
92f8e882 351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
37ab76c9 353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 356} redisClient;
357
358struct saveparam {
359 time_t seconds;
360 int changes;
361};
362
363/* Global server state structure */
364struct redisServer {
365 int port;
366 int fd;
3305306f 367 redisDb *db;
ed9b544e 368 long long dirty; /* changes to DB from the last save */
369 list *clients;
87eca727 370 list *slaves, *monitors;
ed9b544e 371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function has run */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of last save succeeded */
ed9b544e 376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
2a6a2ed1 380 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
44b38ef4 387 int appendonly;
48f0308a 388 int appendfsync;
38db9171 389 int no_appendfsync_on_rewrite;
fab43727 390 int shutdown_asap;
48f0308a 391 time_t lastfsync;
44b38ef4 392 int appendfd;
393 int appendseldb;
ed329fcf 394 char *pidfile;
9f3c422c 395 pid_t bgsavechildpid;
9d65a1bb 396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 398 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
44b38ef4 404 char *appendfilename;
abcb223e 405 char *requirepass;
121f70cf 406 int rdbcompression;
8ca3e9d1 407 int activerehashing;
ed9b544e 408 /* Replication related */
409 int isslave;
d0ccebcf 410 char *masterauth;
ed9b544e 411 char *masterhost;
412 int masterport;
40d224a9 413 redisClient *master; /* client that is master for this slave */
ed9b544e 414 int replstate;
285add55 415 unsigned int maxclients;
4ef8de8a 416 unsigned long long maxmemory;
d5d55fc3 417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
ed9b544e 419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to take this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
75680a3c 424 /* Virtual memory configuration */
425 int vm_enabled;
054e426d 426 char *vm_swap_file;
75680a3c 427 off_t vm_page_size;
428 off_t vm_pages;
4ef8de8a 429 unsigned long long vm_max_memory;
d0686e07 430 /* Zip structure config */
cbba7dd7 431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
d0686e07
PN
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
75680a3c 435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 441 time_t unixtime; /* Unix time sampled every second. */
92f8e882 442 /* Virtual memory I/O threads stuff */
92f8e882 443 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 444 * puts the result of the operation in the io_processed list. While the
445 * job is being processed, it's put on the io_processing queue. */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 450 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (comment originally named io_jobs/io_done/io_thread_job; presumably io_newjobs/io_processing/io_processed -- confirm) */
a5819310 451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 456 /* Our main thread is blocked on the event loop, looking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The following are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
7d98e08c 462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
befec3cd 467 /* Pubsub */
ffc6b7f8 468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 470 /* Misc */
b9bc0eef 471 FILE *devnull;
560db612 472 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
473 unsigned lruclock_padding:10;
ed9b544e 474};
475
ffc6b7f8 476typedef struct pubsubPattern {
477 redisClient *client;
478 robj *pattern;
479} pubsubPattern;
480
ed9b544e 481typedef void redisCommandProc(redisClient *c);
ca1788b5 482typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 483struct redisCommand {
484 char *name;
485 redisCommandProc *proc;
486 int arity;
487 int flags; /* presumably a combination of the REDIS_CMD_* flags -- confirm */
76583ea4
PN
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
ca1788b5 491 redisVmPreloadProc *vm_preload_proc;
7c775e09 492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* The last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
ed9b544e 496};
497
de96dbfe 498struct redisFunctionSym {
499 char *name;
56906eef 500 unsigned long pointer;
de96dbfe 501};
502
ed9b544e 503typedef struct _redisSortObject {
504 robj *obj;
505 union {
506 double score;
507 robj *cmpobj;
508 } u;
509} redisSortObject;
510
511typedef struct _redisSortOperation {
512 int type;
513 robj *pattern;
514} redisSortOperation;
515
6b47e12e 516/* ZSETs use a specialized version of Skiplists */
517
518typedef struct zskiplistNode {
519 struct zskiplistNode **forward;
e3870fab 520 struct zskiplistNode *backward;
912b9165 521 unsigned int *span;
6b47e12e 522 double score;
523 robj *obj;
524} zskiplistNode;
525
526typedef struct zskiplist {
e3870fab 527 struct zskiplistNode *header, *tail;
d13f767c 528 unsigned long length;
6b47e12e 529 int level;
530} zskiplist;
531
1812e024 532typedef struct zset {
533 dict *dict;
6b47e12e 534 zskiplist *zsl;
1812e024 535} zset;
536
6b47e12e 537/* Our shared "common" objects */
538
05df7621 539#define REDIS_SHARED_INTEGERS 10000
ed9b544e 540struct sharedObjectsStruct {
23d3a5fe 541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
6e469882 542 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
ed9b544e 545 *select0, *select1, *select2, *select3, *select4,
befec3cd 546 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 550} shared;
551
a7866db6 552/* Global vars that are actually used as constants. The following double
553 * values are used for double on-disk serialization, and are initialized
554 * at runtime to avoid strange compiler optimizations. */
555
556static double R_Zero, R_PosInf, R_NegInf, R_Nan;
557
92f8e882 558/* VM threaded I/O request message */
b9bc0eef 559#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 562typedef struct iojob {
996cb5f7 563 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 564 redisDb *db;/* Redis database */
92f8e882 565 robj *key; /* This I/O request is about swapping this key */
560db612 566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
568 vmpointer object for REDIS_IOJOB_LOAD. */
b9bc0eef 569 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 570 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
248ea310 572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575} iojob;
92f8e882 576
ed9b544e 577/*================================ Prototypes =============================== */
578
579static void freeStringObject(robj *o);
580static void freeListObject(robj *o);
581static void freeSetObject(robj *o);
582static void decrRefCount(void *o);
583static robj *createObject(int type, void *ptr);
584static void freeClient(redisClient *c);
f78fd11b 585static int rdbLoad(char *filename);
ed9b544e 586static void addReply(redisClient *c, robj *obj);
587static void addReplySds(redisClient *c, sds s);
588static void incrRefCount(robj *o);
f78fd11b 589static int rdbSaveBackground(char *filename);
ed9b544e 590static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 591static robj *dupStringObject(robj *o);
248ea310 592static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 593static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 594static void flushAppendOnlyFile(void);
44b38ef4 595static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 596static int syncWithMaster(void);
05df7621 597static robj *tryObjectEncoding(robj *o);
9d65a1bb 598static robj *getDecodedObject(robj *o);
3305306f 599static int removeExpire(redisDb *db, robj *key);
600static int expireIfNeeded(redisDb *db, robj *key);
601static int deleteIfVolatile(redisDb *db, robj *key);
09241813 602static int dbDelete(redisDb *db, robj *key);
bb32ede5 603static time_t getExpire(redisDb *db, robj *key);
604static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 605static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 606static void freeMemoryIfNeeded(void);
de96dbfe 607static int processCommand(redisClient *c);
56906eef 608static void setupSigSegvAction(void);
a3b21203 609static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 610static void aofRemoveTempFile(pid_t childpid);
0ea663ea 611static size_t stringObjectLen(robj *o);
638e42ac 612static void processInputBuffer(redisClient *c);
6b47e12e 613static zskiplist *zslCreate(void);
fd8ccf44 614static void zslFree(zskiplist *zsl);
2b59cfdf 615static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 616static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 617static void initClientMultiState(redisClient *c);
618static void freeClientMultiState(redisClient *c);
619static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 620static void unblockClientWaitingData(redisClient *c);
4409877e 621static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 622static void vmInit(void);
a35ddf12 623static void vmMarkPagesFree(off_t page, off_t count);
560db612 624static robj *vmLoadObject(robj *o);
625static robj *vmPreviewObject(robj *o);
a69a0c9c 626static int vmSwapOneObjectBlocking(void);
627static int vmSwapOneObjectThreaded(void);
7e69548d 628static int vmCanSwapOut(void);
a5819310 629static int tryFreeOneObjectFromFreelist(void);
996cb5f7 630static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
631static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
632static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 633static void lockThreadedIO(void);
634static void unlockThreadedIO(void);
635static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
636static void freeIOJob(iojob *j);
637static void queueIOJob(iojob *j);
a5819310 638static int vmWriteObjectOnSwap(robj *o, off_t page);
639static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 640static void waitEmptyIOJobsQueue(void);
641static void vmReopenSwapFile(void);
970e10bb 642static int vmFreePage(off_t page);
ca1788b5 643static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 644static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 645static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 646static int dontWaitForSwappedKey(redisClient *c, robj *key);
647static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
648static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
649static struct redisCommand *lookupCommand(char *name);
650static void call(redisClient *c, struct redisCommand *cmd);
651static void resetClient(redisClient *c);
ada386b2 652static void convertToRealHash(robj *o);
003f0840 653static void listTypeConvert(robj *o, int enc);
ffc6b7f8 654static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
655static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
656static void freePubsubPattern(void *p);
657static int listMatchPubsubPattern(void *a, void *b);
658static int compareStringObjects(robj *a, robj *b);
bf028098 659static int equalStringObjects(robj *a, robj *b);
befec3cd 660static void usage(void); /* explicit (void): a real prototype, takes no arguments */
8f63ddca 661static int rewriteAppendOnlyFileBackground(void);
560db612 662static vmpointer *vmSwapObjectBlocking(robj *val);
fab43727 663static int prepareForShutdown(void); /* explicit (void): a real prototype, takes no arguments */
37ab76c9 664static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 665static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 666static void unwatchAllKeys(redisClient *c);
ed9b544e 667
abcb223e 668static void authCommand(redisClient *c);
ed9b544e 669static void pingCommand(redisClient *c);
670static void echoCommand(redisClient *c);
671static void setCommand(redisClient *c);
672static void setnxCommand(redisClient *c);
526d00a5 673static void setexCommand(redisClient *c);
ed9b544e 674static void getCommand(redisClient *c);
675static void delCommand(redisClient *c);
676static void existsCommand(redisClient *c);
677static void incrCommand(redisClient *c);
678static void decrCommand(redisClient *c);
679static void incrbyCommand(redisClient *c);
680static void decrbyCommand(redisClient *c);
681static void selectCommand(redisClient *c);
682static void randomkeyCommand(redisClient *c);
683static void keysCommand(redisClient *c);
684static void dbsizeCommand(redisClient *c);
685static void lastsaveCommand(redisClient *c);
686static void saveCommand(redisClient *c);
687static void bgsaveCommand(redisClient *c);
9d65a1bb 688static void bgrewriteaofCommand(redisClient *c);
ed9b544e 689static void shutdownCommand(redisClient *c);
690static void moveCommand(redisClient *c);
691static void renameCommand(redisClient *c);
692static void renamenxCommand(redisClient *c);
693static void lpushCommand(redisClient *c);
694static void rpushCommand(redisClient *c);
dedff272
RP
695static void lpushxCommand(redisClient *c);
696static void rpushxCommand(redisClient *c);
697static void linsertCommand(redisClient *c);
ed9b544e 698static void lpopCommand(redisClient *c);
699static void rpopCommand(redisClient *c);
700static void llenCommand(redisClient *c);
701static void lindexCommand(redisClient *c);
702static void lrangeCommand(redisClient *c);
703static void ltrimCommand(redisClient *c);
704static void typeCommand(redisClient *c);
705static void lsetCommand(redisClient *c);
706static void saddCommand(redisClient *c);
707static void sremCommand(redisClient *c);
a4460ef4 708static void smoveCommand(redisClient *c);
ed9b544e 709static void sismemberCommand(redisClient *c);
710static void scardCommand(redisClient *c);
12fea928 711static void spopCommand(redisClient *c);
2abb95a9 712static void srandmemberCommand(redisClient *c);
ed9b544e 713static void sinterCommand(redisClient *c);
714static void sinterstoreCommand(redisClient *c);
40d224a9 715static void sunionCommand(redisClient *c);
716static void sunionstoreCommand(redisClient *c);
f4f56e1d 717static void sdiffCommand(redisClient *c);
718static void sdiffstoreCommand(redisClient *c);
ed9b544e 719static void syncCommand(redisClient *c);
720static void flushdbCommand(redisClient *c);
721static void flushallCommand(redisClient *c);
722static void sortCommand(redisClient *c);
723static void lremCommand(redisClient *c);
0f5f7e9a 724static void rpoplpushcommand(redisClient *c);
ed9b544e 725static void infoCommand(redisClient *c);
70003d28 726static void mgetCommand(redisClient *c);
87eca727 727static void monitorCommand(redisClient *c);
3305306f 728static void expireCommand(redisClient *c);
802e8373 729static void expireatCommand(redisClient *c);
f6b141c5 730static void getsetCommand(redisClient *c);
fd88489a 731static void ttlCommand(redisClient *c);
321b0e13 732static void slaveofCommand(redisClient *c);
7f957c92 733static void debugCommand(redisClient *c);
f6b141c5 734static void msetCommand(redisClient *c);
735static void msetnxCommand(redisClient *c);
fd8ccf44 736static void zaddCommand(redisClient *c);
7db723ad 737static void zincrbyCommand(redisClient *c);
cc812361 738static void zrangeCommand(redisClient *c);
50c55df5 739static void zrangebyscoreCommand(redisClient *c);
f44dd428 740static void zcountCommand(redisClient *c);
e3870fab 741static void zrevrangeCommand(redisClient *c);
3c41331e 742static void zcardCommand(redisClient *c);
1b7106e7 743static void zremCommand(redisClient *c);
6e333bbe 744static void zscoreCommand(redisClient *c);
1807985b 745static void zremrangebyscoreCommand(redisClient *c);
6e469882 746static void multiCommand(redisClient *c);
747static void execCommand(redisClient *c);
18b6cb76 748static void discardCommand(redisClient *c);
4409877e 749static void blpopCommand(redisClient *c);
750static void brpopCommand(redisClient *c);
4b00bebd 751static void appendCommand(redisClient *c);
39191553 752static void substrCommand(redisClient *c);
69d95c3e 753static void zrankCommand(redisClient *c);
798d9e55 754static void zrevrankCommand(redisClient *c);
978c2c94 755static void hsetCommand(redisClient *c);
1f1c7695 756static void hsetnxCommand(redisClient *c);
978c2c94 757static void hgetCommand(redisClient *c);
09aeb579
PN
758static void hmsetCommand(redisClient *c);
759static void hmgetCommand(redisClient *c);
07efaf74 760static void hdelCommand(redisClient *c);
92b27fe9 761static void hlenCommand(redisClient *c);
9212eafd 762static void zremrangebyrankCommand(redisClient *c);
5d373da9 763static void zunionstoreCommand(redisClient *c);
764static void zinterstoreCommand(redisClient *c);
78409a0f 765static void hkeysCommand(redisClient *c);
766static void hvalsCommand(redisClient *c);
767static void hgetallCommand(redisClient *c);
a86f14b1 768static void hexistsCommand(redisClient *c);
500ece7c 769static void configCommand(redisClient *c);
01426b05 770static void hincrbyCommand(redisClient *c);
befec3cd 771static void subscribeCommand(redisClient *c);
772static void unsubscribeCommand(redisClient *c);
ffc6b7f8 773static void psubscribeCommand(redisClient *c);
774static void punsubscribeCommand(redisClient *c);
befec3cd 775static void publishCommand(redisClient *c);
37ab76c9 776static void watchCommand(redisClient *c);
777static void unwatchCommand(redisClient *c);
f6b141c5 778
ed9b544e 779/*================================= Globals ================================= */
780
781/* Global vars */
782static struct redisServer server; /* server global state */
1a132bbc 783static struct redisCommand *commandTable;
1a132bbc 784static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
785 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
787 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 788 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
789 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
790 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
796 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
dedff272
RP
798 {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
801 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
811 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
812 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
814 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
815 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
820 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
821 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
822 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
823 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
824 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
825 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
826 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
828 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
829 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 831 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
832 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
833 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
838 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
839 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
841 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 842 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 843 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 844 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 845 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 846 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
847 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
848 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
851 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 852 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
853 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
854 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
856 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
857 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
858 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
861 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
863 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
870 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
875 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
876 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 877 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
878 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
882 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
883 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
885 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
886 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
887 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 888 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 889 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
890 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 891 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
892 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 893 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 894 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 895 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 896};
bcfc686d 897
ed9b544e 898/*============================ Utility functions ============================ */
899
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any run of bytes, including an empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a class; supports '^' negation as first
 *           character, 'a-z' ranges and '\' to quote the next byte
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non-zero literal bytes and ranges are compared after
 * tolower().
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Both buffers are treated as length delimited: every read of the pattern
 * or of the string is guarded by the remaining length, so neither buffer
 * needs to be nul terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Quote the next byte, if any. A trailing backslash is
             * matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1022
/* Glob-style match of two nul terminated C strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns what
 * stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
1026
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional unit, matched case insensitively:
 *
 *   (none) or "b" -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on error.
 * Errors are:
 *   - unknown unit: the number is returned unscaled;
 *   - numeric part too long for the internal buffer: LLONG_MAX is returned;
 *   - the scaled value does not fit a long long: LLONG_MAX or LLONG_MIN is
 *     returned, depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values (and EOF). */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: saturate instead of overflowing the product. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1073
/* Convert a long long into its decimal string representation.
 *
 * At most len-1 characters are stored into 's', followed by a nul
 * terminator. If the buffer is too small the representation is truncated
 * on the right (the leading characters are kept).
 *
 * Returns the number of characters stored, not counting the nul
 * terminator. If 'len' is 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the magnitude using unsigned arithmetic: negating LLONG_MIN as a
     * signed value would overflow. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1097
56906eef 1098static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1099 va_list ap;
1100 FILE *fp;
1101
1102 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1103 if (!fp) return;
1104
1105 va_start(ap, fmt);
1106 if (level >= server.verbosity) {
6766f45e 1107 char *c = ".-*#";
1904ecc1 1108 char buf[64];
1109 time_t now;
1110
1111 now = time(NULL);
6c9385e0 1112 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1113 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1114 vfprintf(fp, fmt, ap);
1115 fprintf(fp,"\n");
1116 fflush(fp);
1117 }
1118 va_end(ap);
1119
1120 if (server.logfile) fclose(fp);
1121}
1122
1123/*====================== Hash table type implementation ==================== */
1124
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1128
/* Generic dict destructor: hands the pointer back to zfree(). The dict
 * private data is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1134
4409877e 1135static void dictListDestructor(void *privdata, void *val)
1136{
1137 DICT_NOTUSED(privdata);
1138 listRelease((list*)val);
1139}
1140
09241813 1141static int dictSdsKeyCompare(void *privdata, const void *key1,
ed9b544e 1142 const void *key2)
1143{
1144 int l1,l2;
1145 DICT_NOTUSED(privdata);
1146
1147 l1 = sdslen((sds)key1);
1148 l2 = sdslen((sds)key2);
1149 if (l1 != l2) return 0;
1150 return memcmp(key1, key2, l1) == 0;
1151}
1152
/* Dict destructor for Redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since values of swapped
 * out keys are set to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1160
/* Dict destructor for sds strings: releases the string with sdsfree().
 * The dict private data is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1167
942a3961 1168static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1169 const void *key2)
1170{
1171 const robj *o1 = key1, *o2 = key2;
09241813 1172 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
ed9b544e 1173}
1174
942a3961 1175static unsigned int dictObjHash(const void *key) {
ed9b544e 1176 const robj *o = key;
1177 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178}
1179
/* Dict hash function for sds keys: hashes all the sdslen() bytes of the
 * string. */
static unsigned int dictSdsHash(const void *key) {
    int len = sdslen((char*)key);

    return dictGenHashFunction((unsigned char*)key, len);
}
1183
942a3961 1184static int dictEncObjKeyCompare(void *privdata, const void *key1,
1185 const void *key2)
1186{
9d65a1bb 1187 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1188 int cmp;
942a3961 1189
2a1198b4 1190 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1191 o2->encoding == REDIS_ENCODING_INT)
1192 return o1->ptr == o2->ptr;
2a1198b4 1193
9d65a1bb 1194 o1 = getDecodedObject(o1);
1195 o2 = getDecodedObject(o2);
09241813 1196 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
9d65a1bb 1197 decrRefCount(o1);
1198 decrRefCount(o2);
1199 return cmp;
942a3961 1200}
1201
1202static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1203 robj *o = (robj*) key;
942a3961 1204
ed9e4966 1205 if (o->encoding == REDIS_ENCODING_RAW) {
1206 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1207 } else {
1208 if (o->encoding == REDIS_ENCODING_INT) {
1209 char buf[32];
1210 int len;
1211
ee14da56 1212 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1213 return dictGenHashFunction((unsigned char*)buf, len);
1214 } else {
1215 unsigned int hash;
1216
1217 o = getDecodedObject(o);
1218 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1219 decrRefCount(o);
1220 return hash;
1221 }
1222 }
942a3961 1223}
1224
09241813 1225/* Sets type */
ed9b544e 1226static dictType setDictType = {
942a3961 1227 dictEncObjHash, /* hash function */
ed9b544e 1228 NULL, /* key dup */
1229 NULL, /* val dup */
942a3961 1230 dictEncObjKeyCompare, /* key compare */
ed9b544e 1231 dictRedisObjectDestructor, /* key destructor */
1232 NULL /* val destructor */
1233};
1234
f2d9f50f 1235/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1236static dictType zsetDictType = {
1237 dictEncObjHash, /* hash function */
1238 NULL, /* key dup */
1239 NULL, /* val dup */
1240 dictEncObjKeyCompare, /* key compare */
1241 dictRedisObjectDestructor, /* key destructor */
da0a1620 1242 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1243};
1244
09241813 1245/* Db->dict, keys are sds strings, vals are Redis objects. */
5234952b 1246static dictType dbDictType = {
09241813 1247 dictSdsHash, /* hash function */
ed9b544e 1248 NULL, /* key dup */
1249 NULL, /* val dup */
09241813 1250 dictSdsKeyCompare, /* key compare */
1251 dictSdsDestructor, /* key destructor */
ed9b544e 1252 dictRedisObjectDestructor /* val destructor */
1253};
1254
f2d9f50f 1255/* Db->expires */
1256static dictType keyptrDictType = {
09241813 1257 dictSdsHash, /* hash function */
f2d9f50f 1258 NULL, /* key dup */
1259 NULL, /* val dup */
09241813 1260 dictSdsKeyCompare, /* key compare */
829137b9 1261 NULL, /* key destructor */
f2d9f50f 1262 NULL /* val destructor */
1263};
1264
/* Hash type hash table (note that small hashes are represented with zipmaps) */
1266static dictType hashDictType = {
1267 dictEncObjHash, /* hash function */
1268 NULL, /* key dup */
1269 NULL, /* val dup */
1270 dictEncObjKeyCompare, /* key compare */
1271 dictRedisObjectDestructor, /* key destructor */
1272 dictRedisObjectDestructor /* val destructor */
1273};
1274
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be loaded. */
4409877e 1278static dictType keylistDictType = {
1279 dictObjHash, /* hash function */
1280 NULL, /* key dup */
1281 NULL, /* val dup */
1282 dictObjKeyCompare, /* key compare */
1283 dictRedisObjectDestructor, /* key destructor */
1284 dictListDestructor /* val destructor */
1285};
1286
42ab0172
AO
1287static void version();
1288
ed9b544e 1289/* ========================= Random utility functions ======================= */
1290
1291/* Redis generally does not try to recover from out of memory conditions
1292 * when allocating objects or strings, it is not clear if it will be possible
1293 * to report this condition to the client since the networking layer itself
1294 * is based on heap allocation for send buffers, so we simply abort.
1295 * At least the code will be simpler to read... */
1296static void oom(const char *msg) {
71c54b21 1297 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1298 sleep(1);
1299 abort();
1300}
1301
1302/* ====================== Redis server networking stuff ===================== */
56906eef 1303static void closeTimedoutClients(void) {
ed9b544e 1304 redisClient *c;
ed9b544e 1305 listNode *ln;
1306 time_t now = time(NULL);
c7df85a4 1307 listIter li;
ed9b544e 1308
c7df85a4 1309 listRewind(server.clients,&li);
1310 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1311 c = listNodeValue(ln);
f86a74e9 1312 if (server.maxidletime &&
1313 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1314 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1315 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1316 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1317 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1318 {
f870935d 1319 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1320 freeClient(c);
f86a74e9 1321 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1322 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1323 addReply(c,shared.nullmultibulk);
b0d8747d 1324 unblockClientWaitingData(c);
f86a74e9 1325 }
ed9b544e 1326 }
1327 }
ed9b544e 1328}
1329
12fea928 1330static int htNeedsResize(dict *dict) {
1331 long long size, used;
1332
1333 size = dictSlots(dict);
1334 used = dictSize(dict);
1335 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1336 (used*100/size < REDIS_HT_MINFILL));
1337}
1338
0bc03378 1339/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1340 * we resize the hash table to save memory */
56906eef 1341static void tryResizeHashTables(void) {
0bc03378 1342 int j;
1343
1344 for (j = 0; j < server.dbnum; j++) {
5413c40d 1345 if (htNeedsResize(server.db[j].dict))
0bc03378 1346 dictResize(server.db[j].dict);
12fea928 1347 if (htNeedsResize(server.db[j].expires))
1348 dictResize(server.db[j].expires);
0bc03378 1349 }
1350}
1351
8ca3e9d1 1352/* Our hash table implementation performs rehashing incrementally while
1353 * we write/read from the hash table. Still if the server is idle, the hash
1354 * table will use two tables for a long time. So we try to use 1 millisecond
1355 * of CPU time at every serverCron() loop in order to rehash some key. */
1356static void incrementallyRehash(void) {
1357 int j;
1358
1359 for (j = 0; j < server.dbnum; j++) {
1360 if (dictIsRehashing(server.db[j].dict)) {
1361 dictRehashMilliseconds(server.db[j].dict,1);
1362 break; /* already used our millisecond for this loop... */
1363 }
1364 }
1365}
1366
9d65a1bb 1367/* A background saving child (BGSAVE) terminated its work. Handle this. */
1368void backgroundSaveDoneHandler(int statloc) {
1369 int exitcode = WEXITSTATUS(statloc);
1370 int bysignal = WIFSIGNALED(statloc);
1371
1372 if (!bysignal && exitcode == 0) {
1373 redisLog(REDIS_NOTICE,
1374 "Background saving terminated with success");
1375 server.dirty = 0;
1376 server.lastsave = time(NULL);
1377 } else if (!bysignal && exitcode != 0) {
1378 redisLog(REDIS_WARNING, "Background saving error");
1379 } else {
1380 redisLog(REDIS_WARNING,
454eea7c 1381 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1382 rdbRemoveTempFile(server.bgsavechildpid);
1383 }
1384 server.bgsavechildpid = -1;
1385 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1386 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1387 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1388}
1389
1390/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1391 * Handle this. */
1392void backgroundRewriteDoneHandler(int statloc) {
1393 int exitcode = WEXITSTATUS(statloc);
1394 int bysignal = WIFSIGNALED(statloc);
1395
1396 if (!bysignal && exitcode == 0) {
1397 int fd;
1398 char tmpfile[256];
1399
1400 redisLog(REDIS_NOTICE,
1401 "Background append only file rewriting terminated with success");
1402 /* Now it's time to flush the differences accumulated by the parent */
1403 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1404 fd = open(tmpfile,O_WRONLY|O_APPEND);
1405 if (fd == -1) {
1406 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1407 goto cleanup;
1408 }
1409 /* Flush our data... */
1410 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1411 (signed) sdslen(server.bgrewritebuf)) {
1412 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1413 close(fd);
1414 goto cleanup;
1415 }
b32627cd 1416 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1417 /* Now our work is to rename the temp file into the stable file. And
1418 * switch the file descriptor used by the server for append only. */
1419 if (rename(tmpfile,server.appendfilename) == -1) {
1420 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1421 close(fd);
1422 goto cleanup;
1423 }
1424 /* Mission completed... almost */
1425 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1426 if (server.appendfd != -1) {
1427 /* If append only is actually enabled... */
1428 close(server.appendfd);
1429 server.appendfd = fd;
d5d23dab 1430 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1431 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1432 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1433 } else {
1434 /* If append only is disabled we just generate a dump in this
1435 * format. Why not? */
1436 close(fd);
1437 }
1438 } else if (!bysignal && exitcode != 0) {
1439 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1440 } else {
1441 redisLog(REDIS_WARNING,
454eea7c 1442 "Background append only file rewriting terminated by signal %d",
1443 WTERMSIG(statloc));
9d65a1bb 1444 }
1445cleanup:
1446 sdsfree(server.bgrewritebuf);
1447 server.bgrewritebuf = sdsempty();
1448 aofRemoveTempFile(server.bgrewritechildpid);
1449 server.bgrewritechildpid = -1;
1450}
1451
884d4b39 1452/* This function is called once a background process of some kind terminates,
1453 * as we want to avoid resizing the hash tables when there is a child in order
1454 * to play well with copy-on-write (otherwise when a resize happens lots of
1455 * memory pages are copied). The goal of this function is to update the ability
1456 * for dict.c to resize the hash tables accordingly to the fact we have o not
1457 * running childs. */
1458static void updateDictResizePolicy(void) {
1459 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1460 dictEnableResize();
1461 else
1462 dictDisableResize();
1463}
1464
56906eef 1465static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1466 int j, loops = server.cronloops++;
ed9b544e 1467 REDIS_NOTUSED(eventLoop);
1468 REDIS_NOTUSED(id);
1469 REDIS_NOTUSED(clientData);
1470
3a66edc7 1471 /* We take a cached value of the unix time in the global state because
1472 * with virtual memory and aging there is to store the current time
1473 * in objects at every object access, and accuracy is not needed.
1474 * To access a global var is faster than calling time(NULL) */
1475 server.unixtime = time(NULL);
560db612 1476 /* We have just 21 bits per object for LRU information.
1477 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1478 *
1479 * When we need to select what object to swap, we compute the minimum
1480 * time distance between the current lruclock and the object last access
1481 * lruclock info. Even if clocks will wrap on overflow, there is
1482 * the interesting property that we are sure that at least
1483 * ABS(A-B) minutes passed between current time and timestamp B.
1484 *
1485 * This is not precise but we don't need at all precision, but just
1486 * something statistically reasonable.
1487 */
1488 server.lruclock = (time(NULL)/60)&((1<<21)-1);
3a66edc7 1489
fab43727 1490 /* We received a SIGTERM, shutting down here in a safe way, as it is
1491 * not ok doing so inside the signal handler. */
1492 if (server.shutdown_asap) {
1493 if (prepareForShutdown() == REDIS_OK) exit(0);
1494 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1495 }
1496
0bc03378 1497 /* Show some info about non-empty databases */
ed9b544e 1498 for (j = 0; j < server.dbnum; j++) {
dec423d9 1499 long long size, used, vkeys;
94754ccc 1500
3305306f 1501 size = dictSlots(server.db[j].dict);
1502 used = dictSize(server.db[j].dict);
94754ccc 1503 vkeys = dictSize(server.db[j].expires);
1763929f 1504 if (!(loops % 50) && (used || vkeys)) {
f870935d 1505 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1506 /* dictPrintStats(server.dict); */
ed9b544e 1507 }
ed9b544e 1508 }
1509
0bc03378 1510 /* We don't want to resize the hash tables while a bacground saving
1511 * is in progress: the saving child is created using fork() that is
1512 * implemented with a copy-on-write semantic in most modern systems, so
1513 * if we resize the HT while there is the saving child at work actually
1514 * a lot of memory movements in the parent will cause a lot of pages
1515 * copied. */
8ca3e9d1 1516 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1517 if (!(loops % 10)) tryResizeHashTables();
1518 if (server.activerehashing) incrementallyRehash();
884d4b39 1519 }
0bc03378 1520
ed9b544e 1521 /* Show information about connected clients */
1763929f 1522 if (!(loops % 50)) {
bdcb92f2 1523 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1524 listLength(server.clients)-listLength(server.slaves),
1525 listLength(server.slaves),
bdcb92f2 1526 zmalloc_used_memory());
ed9b544e 1527 }
1528
1529 /* Close connections of timedout clients */
1763929f 1530 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1531 closeTimedoutClients();
1532
9d65a1bb 1533 /* Check if a background saving or AOF rewrite in progress terminated */
1534 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1535 int statloc;
9d65a1bb 1536 pid_t pid;
1537
1538 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1539 if (pid == server.bgsavechildpid) {
1540 backgroundSaveDoneHandler(statloc);
ed9b544e 1541 } else {
9d65a1bb 1542 backgroundRewriteDoneHandler(statloc);
ed9b544e 1543 }
884d4b39 1544 updateDictResizePolicy();
ed9b544e 1545 }
1546 } else {
1547 /* If there is not a background saving in progress check if
1548 * we have to save now */
1549 time_t now = time(NULL);
1550 for (j = 0; j < server.saveparamslen; j++) {
1551 struct saveparam *sp = server.saveparams+j;
1552
1553 if (server.dirty >= sp->changes &&
1554 now-server.lastsave > sp->seconds) {
1555 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1556 sp->changes, sp->seconds);
f78fd11b 1557 rdbSaveBackground(server.dbfilename);
ed9b544e 1558 break;
1559 }
1560 }
1561 }
94754ccc 1562
f2324293 1563 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1564 * will use few CPU cycles if there are few expiring keys, otherwise
1565 * it will get more aggressive to avoid that too much memory is used by
1566 * keys that can be removed from the keyspace. */
94754ccc 1567 for (j = 0; j < server.dbnum; j++) {
f2324293 1568 int expired;
94754ccc 1569 redisDb *db = server.db+j;
94754ccc 1570
f2324293 1571 /* Continue to expire if at the end of the cycle more than 25%
1572 * of the keys were expired. */
1573 do {
4ef8de8a 1574 long num = dictSize(db->expires);
94754ccc 1575 time_t now = time(NULL);
1576
f2324293 1577 expired = 0;
94754ccc 1578 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1579 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1580 while (num--) {
1581 dictEntry *de;
1582 time_t t;
1583
1584 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1585 t = (time_t) dictGetEntryVal(de);
1586 if (now > t) {
09241813 1587 sds key = dictGetEntryKey(de);
1588 robj *keyobj = createStringObject(key,sdslen(key));
1589
1590 dbDelete(db,keyobj);
1591 decrRefCount(keyobj);
f2324293 1592 expired++;
2a6a2ed1 1593 server.stat_expiredkeys++;
94754ccc 1594 }
1595 }
f2324293 1596 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1597 }
1598
4ef8de8a 1599 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1600 * is enbled. Try to free objects from the free list first. */
7e69548d 1601 if (vmCanSwapOut()) {
1602 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1603 server.vm_max_memory)
1604 {
72e9fd40 1605 int retval;
1606
a5819310 1607 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1608 retval = (server.vm_max_threads == 0) ?
1609 vmSwapOneObjectBlocking() :
1610 vmSwapOneObjectThreaded();
1763929f 1611 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1612 zmalloc_used_memory() >
1613 (server.vm_max_memory+server.vm_max_memory/10))
1614 {
1615 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1616 }
72e9fd40 1617 /* Note that when using threade I/O we free just one object,
1618 * because anyway when the I/O thread in charge to swap this
1619 * object out will finish, the handler of completed jobs
1620 * will try to swap more objects if we are still out of memory. */
1621 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1622 }
1623 }
1624
ed9b544e 1625 /* Check if we should connect to a MASTER */
1763929f 1626 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1627 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1628 if (syncWithMaster() == REDIS_OK) {
1629 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1630 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1631 }
1632 }
1763929f 1633 return 100;
ed9b544e 1634}
1635
d5d55fc3 1636/* This function gets called every time Redis is entering the
1637 * main loop of the event driven library, that is, before to sleep
1638 * for ready file descriptors. */
1639static void beforeSleep(struct aeEventLoop *eventLoop) {
1640 REDIS_NOTUSED(eventLoop);
1641
28ed1f33 1642 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1643 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1644 listIter li;
1645 listNode *ln;
1646
1647 listRewind(server.io_ready_clients,&li);
1648 while((ln = listNext(&li))) {
1649 redisClient *c = ln->value;
1650 struct redisCommand *cmd;
1651
1652 /* Resume the client. */
1653 listDelNode(server.io_ready_clients,ln);
1654 c->flags &= (~REDIS_IO_WAIT);
1655 server.vm_blocked_clients--;
1656 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1657 readQueryFromClient, c);
1658 cmd = lookupCommand(c->argv[0]->ptr);
1659 assert(cmd != NULL);
1660 call(c,cmd);
1661 resetClient(c);
1662 /* There may be more data to process in the input buffer. */
1663 if (c->querybuf && sdslen(c->querybuf) > 0)
1664 processInputBuffer(c);
1665 }
1666 }
28ed1f33 1667 /* Write the AOF buffer on disk */
1668 flushAppendOnlyFile();
d5d55fc3 1669}
1670
ed9b544e 1671static void createSharedObjects(void) {
05df7621 1672 int j;
1673
ed9b544e 1674 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1675 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1676 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1677 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1678 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1679 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
23d3a5fe 1680 shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n"));
c937aa89 1681 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1682 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1683 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1684 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1685 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1686 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1688 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR no such key\r\n"));
ed9b544e 1690 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR syntax error\r\n"));
c937aa89 1692 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR source and destination objects are the same\r\n"));
1694 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1695 "-ERR index out of range\r\n"));
ed9b544e 1696 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1697 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1698 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1699 shared.select0 = createStringObject("select 0\r\n",10);
1700 shared.select1 = createStringObject("select 1\r\n",10);
1701 shared.select2 = createStringObject("select 2\r\n",10);
1702 shared.select3 = createStringObject("select 3\r\n",10);
1703 shared.select4 = createStringObject("select 4\r\n",10);
1704 shared.select5 = createStringObject("select 5\r\n",10);
1705 shared.select6 = createStringObject("select 6\r\n",10);
1706 shared.select7 = createStringObject("select 7\r\n",10);
1707 shared.select8 = createStringObject("select 8\r\n",10);
1708 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1709 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1710 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1711 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1712 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1713 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1714 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1715 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1716 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1717 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1718 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1719 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1720 }
ed9b544e 1721}
1722
1723static void appendServerSaveParams(time_t seconds, int changes) {
1724 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1725 server.saveparams[server.saveparamslen].seconds = seconds;
1726 server.saveparams[server.saveparamslen].changes = changes;
1727 server.saveparamslen++;
1728}
1729
bcfc686d 1730static void resetServerSaveParams() {
ed9b544e 1731 zfree(server.saveparams);
1732 server.saveparams = NULL;
1733 server.saveparamslen = 0;
1734}
1735
1736static void initServerConfig() {
1737 server.dbnum = REDIS_DEFAULT_DBNUM;
1738 server.port = REDIS_SERVERPORT;
f870935d 1739 server.verbosity = REDIS_VERBOSE;
ed9b544e 1740 server.maxidletime = REDIS_MAXIDLETIME;
1741 server.saveparams = NULL;
1742 server.logfile = NULL; /* NULL = log on standard output */
1743 server.bindaddr = NULL;
1744 server.glueoutputbuf = 1;
1745 server.daemonize = 0;
44b38ef4 1746 server.appendonly = 0;
1b677732 1747 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1748 server.no_appendfsync_on_rewrite = 0;
48f0308a 1749 server.lastfsync = time(NULL);
44b38ef4 1750 server.appendfd = -1;
1751 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1752 server.pidfile = zstrdup("/var/run/redis.pid");
1753 server.dbfilename = zstrdup("dump.rdb");
1754 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1755 server.requirepass = NULL;
b0553789 1756 server.rdbcompression = 1;
8ca3e9d1 1757 server.activerehashing = 1;
285add55 1758 server.maxclients = 0;
d5d55fc3 1759 server.blpop_blocked_clients = 0;
3fd78bcd 1760 server.maxmemory = 0;
75680a3c 1761 server.vm_enabled = 0;
054e426d 1762 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1763 server.vm_page_size = 256; /* 256 bytes per page */
1764 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1765 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1766 server.vm_max_threads = 4;
d5d55fc3 1767 server.vm_blocked_clients = 0;
cbba7dd7 1768 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1769 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
d0686e07
PN
1770 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1771 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
fab43727 1772 server.shutdown_asap = 0;
75680a3c 1773
bcfc686d 1774 resetServerSaveParams();
ed9b544e 1775
1776 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1777 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1778 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1779 /* Replication related */
1780 server.isslave = 0;
d0ccebcf 1781 server.masterauth = NULL;
ed9b544e 1782 server.masterhost = NULL;
1783 server.masterport = 6379;
1784 server.master = NULL;
1785 server.replstate = REDIS_REPL_NONE;
a7866db6 1786
1787 /* Double constants initialization */
1788 R_Zero = 0.0;
1789 R_PosInf = 1.0/R_Zero;
1790 R_NegInf = -1.0/R_Zero;
1791 R_Nan = R_Zero/R_Zero;
ed9b544e 1792}
1793
1794static void initServer() {
1795 int j;
1796
1797 signal(SIGHUP, SIG_IGN);
1798 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1799 setupSigSegvAction();
ed9b544e 1800
b9bc0eef 1801 server.devnull = fopen("/dev/null","w");
1802 if (server.devnull == NULL) {
1803 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1804 exit(1);
1805 }
ed9b544e 1806 server.clients = listCreate();
1807 server.slaves = listCreate();
87eca727 1808 server.monitors = listCreate();
ed9b544e 1809 server.objfreelist = listCreate();
1810 createSharedObjects();
1811 server.el = aeCreateEventLoop();
3305306f 1812 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1813 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1814 if (server.fd == -1) {
1815 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1816 exit(1);
1817 }
3305306f 1818 for (j = 0; j < server.dbnum; j++) {
5234952b 1819 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1820 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1821 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1822 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1823 if (server.vm_enabled)
1824 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1825 server.db[j].id = j;
1826 }
ffc6b7f8 1827 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1828 server.pubsub_patterns = listCreate();
1829 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1830 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1831 server.cronloops = 0;
9f3c422c 1832 server.bgsavechildpid = -1;
9d65a1bb 1833 server.bgrewritechildpid = -1;
1834 server.bgrewritebuf = sdsempty();
28ed1f33 1835 server.aofbuf = sdsempty();
ed9b544e 1836 server.lastsave = time(NULL);
1837 server.dirty = 0;
ed9b544e 1838 server.stat_numcommands = 0;
1839 server.stat_numconnections = 0;
2a6a2ed1 1840 server.stat_expiredkeys = 0;
ed9b544e 1841 server.stat_starttime = time(NULL);
3a66edc7 1842 server.unixtime = time(NULL);
d8f8b666 1843 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1844 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1845 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1846
1847 if (server.appendonly) {
3bb225d6 1848 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1849 if (server.appendfd == -1) {
1850 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1851 strerror(errno));
1852 exit(1);
1853 }
1854 }
75680a3c 1855
1856 if (server.vm_enabled) vmInit();
ed9b544e 1857}
1858
1859/* Empty the whole database */
ca37e9cd 1860static long long emptyDb() {
ed9b544e 1861 int j;
ca37e9cd 1862 long long removed = 0;
ed9b544e 1863
3305306f 1864 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1865 removed += dictSize(server.db[j].dict);
3305306f 1866 dictEmpty(server.db[j].dict);
1867 dictEmpty(server.db[j].expires);
1868 }
ca37e9cd 1869 return removed;
ed9b544e 1870}
1871
/* Convert a "yes" / "no" string (compared case-insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1877
ed9b544e 1878/* I agree, this is a very rudimental way to load a configuration...
1879 will improve later if the config gets more complex */
1880static void loadServerConfig(char *filename) {
c9a111ac 1881 FILE *fp;
ed9b544e 1882 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1883 int linenum = 0;
1884 sds line = NULL;
c9a111ac 1885
1886 if (filename[0] == '-' && filename[1] == '\0')
1887 fp = stdin;
1888 else {
1889 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1890 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1891 exit(1);
1892 }
ed9b544e 1893 }
c9a111ac 1894
ed9b544e 1895 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1896 sds *argv;
1897 int argc, j;
1898
1899 linenum++;
1900 line = sdsnew(buf);
1901 line = sdstrim(line," \t\r\n");
1902
1903 /* Skip comments and blank lines*/
1904 if (line[0] == '#' || line[0] == '\0') {
1905 sdsfree(line);
1906 continue;
1907 }
1908
1909 /* Split into arguments */
1910 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1911 sdstolower(argv[0]);
1912
1913 /* Execute config directives */
bb0b03a3 1914 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1915 server.maxidletime = atoi(argv[1]);
0150db36 1916 if (server.maxidletime < 0) {
ed9b544e 1917 err = "Invalid timeout value"; goto loaderr;
1918 }
bb0b03a3 1919 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1920 server.port = atoi(argv[1]);
1921 if (server.port < 1 || server.port > 65535) {
1922 err = "Invalid port"; goto loaderr;
1923 }
bb0b03a3 1924 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1925 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1926 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1927 int seconds = atoi(argv[1]);
1928 int changes = atoi(argv[2]);
1929 if (seconds < 1 || changes < 0) {
1930 err = "Invalid save parameters"; goto loaderr;
1931 }
1932 appendServerSaveParams(seconds,changes);
bb0b03a3 1933 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1934 if (chdir(argv[1]) == -1) {
1935 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1936 argv[1], strerror(errno));
1937 exit(1);
1938 }
bb0b03a3 1939 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1940 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1941 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1942 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1943 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1944 else {
1945 err = "Invalid log level. Must be one of debug, notice, warning";
1946 goto loaderr;
1947 }
bb0b03a3 1948 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1949 FILE *logfp;
ed9b544e 1950
1951 server.logfile = zstrdup(argv[1]);
bb0b03a3 1952 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1953 zfree(server.logfile);
1954 server.logfile = NULL;
1955 }
1956 if (server.logfile) {
1957 /* Test if we are able to open the file. The server will not
1958 * be able to abort just for this problem later... */
c9a111ac 1959 logfp = fopen(server.logfile,"a");
1960 if (logfp == NULL) {
ed9b544e 1961 err = sdscatprintf(sdsempty(),
1962 "Can't open the log file: %s", strerror(errno));
1963 goto loaderr;
1964 }
c9a111ac 1965 fclose(logfp);
ed9b544e 1966 }
bb0b03a3 1967 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1968 server.dbnum = atoi(argv[1]);
1969 if (server.dbnum < 1) {
1970 err = "Invalid number of databases"; goto loaderr;
1971 }
b3f83f12
JZ
1972 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1973 loadServerConfig(argv[1]);
285add55 1974 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1975 server.maxclients = atoi(argv[1]);
3fd78bcd 1976 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1977 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1978 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1979 server.masterhost = sdsnew(argv[1]);
1980 server.masterport = atoi(argv[2]);
1981 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1982 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1983 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1984 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1985 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1986 err = "argument must be 'yes' or 'no'"; goto loaderr;
1987 }
121f70cf 1988 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1989 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1990 err = "argument must be 'yes' or 'no'"; goto loaderr;
1991 }
1992 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1993 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1994 err = "argument must be 'yes' or 'no'"; goto loaderr;
1995 }
bb0b03a3 1996 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1997 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1998 err = "argument must be 'yes' or 'no'"; goto loaderr;
1999 }
44b38ef4 2000 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2001 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2002 err = "argument must be 'yes' or 'no'"; goto loaderr;
2003 }
f3b52411
PN
2004 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2005 zfree(server.appendfilename);
2006 server.appendfilename = zstrdup(argv[1]);
38db9171 2007 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2008 && argc == 2) {
2009 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2010 err = "argument must be 'yes' or 'no'"; goto loaderr;
2011 }
48f0308a 2012 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 2013 if (!strcasecmp(argv[1],"no")) {
48f0308a 2014 server.appendfsync = APPENDFSYNC_NO;
1766c6da 2015 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 2016 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 2017 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 2018 server.appendfsync = APPENDFSYNC_EVERYSEC;
2019 } else {
2020 err = "argument must be 'no', 'always' or 'everysec'";
2021 goto loaderr;
2022 }
bb0b03a3 2023 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 2024 server.requirepass = zstrdup(argv[1]);
bb0b03a3 2025 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 2026 zfree(server.pidfile);
054e426d 2027 server.pidfile = zstrdup(argv[1]);
bb0b03a3 2028 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 2029 zfree(server.dbfilename);
054e426d 2030 server.dbfilename = zstrdup(argv[1]);
75680a3c 2031 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2032 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2033 err = "argument must be 'yes' or 'no'"; goto loaderr;
2034 }
054e426d 2035 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 2036 zfree(server.vm_swap_file);
054e426d 2037 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 2038 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 2039 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 2040 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 2041 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 2042 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 2043 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 2044 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2045 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 2046 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 2047 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 2048 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 2049 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
d0686e07
PN
2050 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2051 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2052 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2053 server.list_max_ziplist_value = memtoll(argv[1], NULL);
ed9b544e 2054 } else {
2055 err = "Bad directive or wrong number of arguments"; goto loaderr;
2056 }
2057 for (j = 0; j < argc; j++)
2058 sdsfree(argv[j]);
2059 zfree(argv);
2060 sdsfree(line);
2061 }
c9a111ac 2062 if (fp != stdin) fclose(fp);
ed9b544e 2063 return;
2064
2065loaderr:
2066 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2067 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2068 fprintf(stderr, ">>> '%s'\n", line);
2069 fprintf(stderr, "%s\n", err);
2070 exit(1);
2071}
2072
2073static void freeClientArgv(redisClient *c) {
2074 int j;
2075
2076 for (j = 0; j < c->argc; j++)
2077 decrRefCount(c->argv[j]);
e8a74421 2078 for (j = 0; j < c->mbargc; j++)
2079 decrRefCount(c->mbargv[j]);
ed9b544e 2080 c->argc = 0;
e8a74421 2081 c->mbargc = 0;
ed9b544e 2082}
2083
2084static void freeClient(redisClient *c) {
2085 listNode *ln;
2086
4409877e 2087 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2088 * call, we have to set querybuf to NULL *before* to call
2089 * unblockClientWaitingData() to avoid processInputBuffer() will get
2090 * called. Also it is important to remove the file events after
2091 * this, because this call adds the READABLE event. */
4409877e 2092 sdsfree(c->querybuf);
2093 c->querybuf = NULL;
2094 if (c->flags & REDIS_BLOCKED)
b0d8747d 2095 unblockClientWaitingData(c);
4409877e 2096
37ab76c9 2097 /* UNWATCH all the keys */
2098 unwatchAllKeys(c);
2099 listRelease(c->watched_keys);
ffc6b7f8 2100 /* Unsubscribe from all the pubsub channels */
2101 pubsubUnsubscribeAllChannels(c,0);
2102 pubsubUnsubscribeAllPatterns(c,0);
2103 dictRelease(c->pubsub_channels);
2104 listRelease(c->pubsub_patterns);
befec3cd 2105 /* Obvious cleanup */
ed9b544e 2106 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2107 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2108 listRelease(c->reply);
2109 freeClientArgv(c);
2110 close(c->fd);
92f8e882 2111 /* Remove from the list of clients */
ed9b544e 2112 ln = listSearchKey(server.clients,c);
dfc5e96c 2113 redisAssert(ln != NULL);
ed9b544e 2114 listDelNode(server.clients,ln);
37ab76c9 2115 /* Remove from the list of clients that are now ready to be restarted
2116 * after waiting for swapped keys */
d5d55fc3 2117 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2118 ln = listSearchKey(server.io_ready_clients,c);
2119 if (ln) {
2120 listDelNode(server.io_ready_clients,ln);
2121 server.vm_blocked_clients--;
2122 }
2123 }
37ab76c9 2124 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2125 while (server.vm_enabled && listLength(c->io_keys)) {
2126 ln = listFirst(c->io_keys);
2127 dontWaitForSwappedKey(c,ln->value);
92f8e882 2128 }
b3e3d0d7 2129 listRelease(c->io_keys);
befec3cd 2130 /* Master/slave cleanup */
ed9b544e 2131 if (c->flags & REDIS_SLAVE) {
6208b3a7 2132 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2133 close(c->repldbfd);
87eca727 2134 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2135 ln = listSearchKey(l,c);
dfc5e96c 2136 redisAssert(ln != NULL);
87eca727 2137 listDelNode(l,ln);
ed9b544e 2138 }
2139 if (c->flags & REDIS_MASTER) {
2140 server.master = NULL;
2141 server.replstate = REDIS_REPL_CONNECT;
2142 }
befec3cd 2143 /* Release memory */
93ea3759 2144 zfree(c->argv);
e8a74421 2145 zfree(c->mbargv);
6e469882 2146 freeClientMultiState(c);
ed9b544e 2147 zfree(c);
2148}
2149
cc30e368 2150#define GLUEREPLY_UP_TO (1024)
ed9b544e 2151static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2152 int copylen = 0;
2153 char buf[GLUEREPLY_UP_TO];
6208b3a7 2154 listNode *ln;
c7df85a4 2155 listIter li;
ed9b544e 2156 robj *o;
2157
c7df85a4 2158 listRewind(c->reply,&li);
2159 while((ln = listNext(&li))) {
c28b42ac 2160 int objlen;
2161
ed9b544e 2162 o = ln->value;
c28b42ac 2163 objlen = sdslen(o->ptr);
2164 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2165 memcpy(buf+copylen,o->ptr,objlen);
2166 copylen += objlen;
ed9b544e 2167 listDelNode(c->reply,ln);
c28b42ac 2168 } else {
2169 if (copylen == 0) return;
2170 break;
ed9b544e 2171 }
ed9b544e 2172 }
c28b42ac 2173 /* Now the output buffer is empty, add the new single element */
2174 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2175 listAddNodeHead(c->reply,o);
ed9b544e 2176}
2177
2178static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2179 redisClient *c = privdata;
2180 int nwritten = 0, totwritten = 0, objlen;
2181 robj *o;
2182 REDIS_NOTUSED(el);
2183 REDIS_NOTUSED(mask);
2184
2895e862 2185 /* Use writev() if we have enough buffers to send */
7ea870c0 2186 if (!server.glueoutputbuf &&
e0a62c7f 2187 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2188 !(c->flags & REDIS_MASTER))
2895e862 2189 {
2190 sendReplyToClientWritev(el, fd, privdata, mask);
2191 return;
2192 }
2895e862 2193
ed9b544e 2194 while(listLength(c->reply)) {
c28b42ac 2195 if (server.glueoutputbuf && listLength(c->reply) > 1)
2196 glueReplyBuffersIfNeeded(c);
2197
ed9b544e 2198 o = listNodeValue(listFirst(c->reply));
2199 objlen = sdslen(o->ptr);
2200
2201 if (objlen == 0) {
2202 listDelNode(c->reply,listFirst(c->reply));
2203 continue;
2204 }
2205
2206 if (c->flags & REDIS_MASTER) {
6f376729 2207 /* Don't reply to a master */
ed9b544e 2208 nwritten = objlen - c->sentlen;
2209 } else {
a4d1ba9a 2210 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2211 if (nwritten <= 0) break;
2212 }
2213 c->sentlen += nwritten;
2214 totwritten += nwritten;
2215 /* If we fully sent the object on head go to the next one */
2216 if (c->sentlen == objlen) {
2217 listDelNode(c->reply,listFirst(c->reply));
2218 c->sentlen = 0;
2219 }
6f376729 2220 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2221 * bytes, in a single threaded server it's a good idea to serve
6f376729 2222 * other clients as well, even if a very large request comes from
2223 * super fast link that is always able to accept data (in real world
12f9d551 2224 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2225 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2226 }
2227 if (nwritten == -1) {
2228 if (errno == EAGAIN) {
2229 nwritten = 0;
2230 } else {
f870935d 2231 redisLog(REDIS_VERBOSE,
ed9b544e 2232 "Error writing to client: %s", strerror(errno));
2233 freeClient(c);
2234 return;
2235 }
2236 }
2237 if (totwritten > 0) c->lastinteraction = time(NULL);
2238 if (listLength(c->reply) == 0) {
2239 c->sentlen = 0;
2240 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2241 }
2242}
2243
2895e862 2244static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2245{
2246 redisClient *c = privdata;
2247 int nwritten = 0, totwritten = 0, objlen, willwrite;
2248 robj *o;
2249 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2250 int offset, ion = 0;
2251 REDIS_NOTUSED(el);
2252 REDIS_NOTUSED(mask);
2253
2254 listNode *node;
2255 while (listLength(c->reply)) {
2256 offset = c->sentlen;
2257 ion = 0;
2258 willwrite = 0;
2259
2260 /* fill-in the iov[] array */
2261 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2262 o = listNodeValue(node);
2263 objlen = sdslen(o->ptr);
2264
e0a62c7f 2265 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2266 break;
2267
2268 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2269 break; /* no more iovecs */
2270
2271 iov[ion].iov_base = ((char*)o->ptr) + offset;
2272 iov[ion].iov_len = objlen - offset;
2273 willwrite += objlen - offset;
2274 offset = 0; /* just for the first item */
2275 ion++;
2276 }
2277
2278 if(willwrite == 0)
2279 break;
2280
2281 /* write all collected blocks at once */
2282 if((nwritten = writev(fd, iov, ion)) < 0) {
2283 if (errno != EAGAIN) {
f870935d 2284 redisLog(REDIS_VERBOSE,
2895e862 2285 "Error writing to client: %s", strerror(errno));
2286 freeClient(c);
2287 return;
2288 }
2289 break;
2290 }
2291
2292 totwritten += nwritten;
2293 offset = c->sentlen;
2294
2295 /* remove written robjs from c->reply */
2296 while (nwritten && listLength(c->reply)) {
2297 o = listNodeValue(listFirst(c->reply));
2298 objlen = sdslen(o->ptr);
2299
2300 if(nwritten >= objlen - offset) {
2301 listDelNode(c->reply, listFirst(c->reply));
2302 nwritten -= objlen - offset;
2303 c->sentlen = 0;
2304 } else {
2305 /* partial write */
2306 c->sentlen += nwritten;
2307 break;
2308 }
2309 offset = 0;
2310 }
2311 }
2312
e0a62c7f 2313 if (totwritten > 0)
2895e862 2314 c->lastinteraction = time(NULL);
2315
2316 if (listLength(c->reply) == 0) {
2317 c->sentlen = 0;
2318 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2319 }
2320}
2321
1a132bbc
PN
2322static int qsortRedisCommands(const void *r1, const void *r2) {
2323 return strcasecmp(
2324 ((struct redisCommand*)r1)->name,
2325 ((struct redisCommand*)r2)->name);
2326}
2327
2328static void sortCommandTable() {
1a132bbc
PN
2329 /* Copy and sort the read-only version of the command table */
2330 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2331 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2332 qsort(commandTable,
2333 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2334 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2335}
2336
ed9b544e 2337static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2338 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2339 return bsearch(
2340 &tmp,
2341 commandTable,
d55d5c5d 2342 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2343 sizeof(struct redisCommand),
2344 qsortRedisCommands);
ed9b544e 2345}
2346
2347/* resetClient prepare the client to process the next command */
2348static void resetClient(redisClient *c) {
2349 freeClientArgv(c);
2350 c->bulklen = -1;
e8a74421 2351 c->multibulk = 0;
ed9b544e 2352}
2353
6e469882 2354/* Call() is the core of Redis execution of a command */
2355static void call(redisClient *c, struct redisCommand *cmd) {
2356 long long dirty;
2357
2358 dirty = server.dirty;
2359 cmd->proc(c);
4005fef1 2360 dirty = server.dirty-dirty;
2361
2362 if (server.appendonly && dirty)
6e469882 2363 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2364 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2365 listLength(server.slaves))
248ea310 2366 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2367 if (listLength(server.monitors))
dd142b9c 2368 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2369 server.stat_numcommands++;
2370}
2371
ed9b544e 2372/* If this function gets called we already read a whole
2373 * command, argments are in the client argv/argc fields.
2374 * processCommand() execute the command or prepare the
2375 * server for a bulk read from the client.
2376 *
2377 * If 1 is returned the client is still alive and valid and
2378 * and other operations can be performed by the caller. Otherwise
2379 * if 0 is returned the client was destroied (i.e. after QUIT). */
2380static int processCommand(redisClient *c) {
2381 struct redisCommand *cmd;
ed9b544e 2382
3fd78bcd 2383 /* Free some memory if needed (maxmemory setting) */
2384 if (server.maxmemory) freeMemoryIfNeeded();
2385
e8a74421 2386 /* Handle the multi bulk command type. This is an alternative protocol
2387 * supported by Redis in order to receive commands that are composed of
2388 * multiple binary-safe "bulk" arguments. The latency of processing is
2389 * a bit higher but this allows things like multi-sets, so if this
2390 * protocol is used only for MSET and similar commands this is a big win. */
2391 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2392 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2393 if (c->multibulk <= 0) {
2394 resetClient(c);
2395 return 1;
2396 } else {
2397 decrRefCount(c->argv[c->argc-1]);
2398 c->argc--;
2399 return 1;
2400 }
2401 } else if (c->multibulk) {
2402 if (c->bulklen == -1) {
2403 if (((char*)c->argv[0]->ptr)[0] != '$') {
2404 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2405 resetClient(c);
2406 return 1;
2407 } else {
2408 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2409 decrRefCount(c->argv[0]);
2410 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2411 c->argc--;
2412 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2413 resetClient(c);
2414 return 1;
2415 }
2416 c->argc--;
2417 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2418 return 1;
2419 }
2420 } else {
2421 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2422 c->mbargv[c->mbargc] = c->argv[0];
2423 c->mbargc++;
2424 c->argc--;
2425 c->multibulk--;
2426 if (c->multibulk == 0) {
2427 robj **auxargv;
2428 int auxargc;
2429
2430 /* Here we need to swap the multi-bulk argc/argv with the
2431 * normal argc/argv of the client structure. */
2432 auxargv = c->argv;
2433 c->argv = c->mbargv;
2434 c->mbargv = auxargv;
2435
2436 auxargc = c->argc;
2437 c->argc = c->mbargc;
2438 c->mbargc = auxargc;
2439
2440 /* We need to set bulklen to something different than -1
2441 * in order for the code below to process the command without
2442 * to try to read the last argument of a bulk command as
2443 * a special argument. */
2444 c->bulklen = 0;
2445 /* continue below and process the command */
2446 } else {
2447 c->bulklen = -1;
2448 return 1;
2449 }
2450 }
2451 }
2452 /* -- end of multi bulk commands processing -- */
2453
ed9b544e 2454 /* The QUIT command is handled as a special case. Normal command
2455 * procs are unable to close the client connection safely */
bb0b03a3 2456 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2457 freeClient(c);
2458 return 0;
2459 }
d5d55fc3 2460
2461 /* Now lookup the command and check ASAP about trivial error conditions
2462 * such wrong arity, bad command name and so forth. */
ed9b544e 2463 cmd = lookupCommand(c->argv[0]->ptr);
2464 if (!cmd) {
2c14807b 2465 addReplySds(c,
2466 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2467 (char*)c->argv[0]->ptr));
ed9b544e 2468 resetClient(c);
2469 return 1;
2470 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2471 (c->argc < -cmd->arity)) {
454d4e43 2472 addReplySds(c,
2473 sdscatprintf(sdsempty(),
2474 "-ERR wrong number of arguments for '%s' command\r\n",
2475 cmd->name));
ed9b544e 2476 resetClient(c);
2477 return 1;
2478 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2479 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2480 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2481
2482 decrRefCount(c->argv[c->argc-1]);
2483 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2484 c->argc--;
2485 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2486 resetClient(c);
2487 return 1;
2488 }
2489 c->argc--;
2490 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2491 /* It is possible that the bulk read is already in the
8d0490e7 2492 * buffer. Check this condition and handle it accordingly.
2493 * This is just a fast path, alternative to call processInputBuffer().
2494 * It's a good idea since the code is small and this condition
2495 * happens most of the times. */
ed9b544e 2496 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2497 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2498 c->argc++;
2499 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2500 } else {
d5d55fc3 2501 /* Otherwise return... there is to read the last argument
2502 * from the socket. */
ed9b544e 2503 return 1;
2504 }
2505 }
942a3961 2506 /* Let's try to encode the bulk object to save space. */
2507 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2508 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2509
e63943a4 2510 /* Check if the user is authenticated */
2511 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2512 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2513 resetClient(c);
2514 return 1;
2515 }
2516
b61a28fe 2517 /* Handle the maxmemory directive */
2518 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2519 zmalloc_used_memory() > server.maxmemory)
2520 {
2521 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2522 resetClient(c);
2523 return 1;
2524 }
2525
d6cc8867 2526 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2527 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2528 &&
ffc6b7f8 2529 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2530 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2531 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2532 resetClient(c);
2533 return 1;
2534 }
2535
ed9b544e 2536 /* Exec the command */
6531c94d 2537 if (c->flags & REDIS_MULTI &&
2538 cmd->proc != execCommand && cmd->proc != discardCommand &&
2539 cmd->proc != multiCommand && cmd->proc != watchCommand)
2540 {
6e469882 2541 queueMultiCommand(c,cmd);
2542 addReply(c,shared.queued);
2543 } else {
d5d55fc3 2544 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2545 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2546 call(c,cmd);
2547 }
ed9b544e 2548
2549 /* Prepare the client for the next command */
ed9b544e 2550 resetClient(c);
2551 return 1;
2552}
2553
248ea310 2554static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2555 listNode *ln;
c7df85a4 2556 listIter li;
ed9b544e 2557 int outc = 0, j;
93ea3759 2558 robj **outv;
248ea310 2559 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2560 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2561 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2562 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2563 robj *lenobj;
93ea3759 2564
2565 if (argc <= REDIS_STATIC_ARGS) {
2566 outv = static_outv;
2567 } else {
248ea310 2568 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2569 }
248ea310 2570
2571 lenobj = createObject(REDIS_STRING,
2572 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2573 lenobj->refcount = 0;
2574 outv[outc++] = lenobj;
ed9b544e 2575 for (j = 0; j < argc; j++) {
248ea310 2576 lenobj = createObject(REDIS_STRING,
2577 sdscatprintf(sdsempty(),"$%lu\r\n",
2578 (unsigned long) stringObjectLen(argv[j])));
2579 lenobj->refcount = 0;
2580 outv[outc++] = lenobj;
ed9b544e 2581 outv[outc++] = argv[j];
248ea310 2582 outv[outc++] = shared.crlf;
ed9b544e 2583 }
ed9b544e 2584
40d224a9 2585 /* Increment all the refcounts at start and decrement at end in order to
2586 * be sure to free objects if there is no slave in a replication state
2587 * able to be feed with commands */
2588 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2589 listRewind(slaves,&li);
2590 while((ln = listNext(&li))) {
ed9b544e 2591 redisClient *slave = ln->value;
40d224a9 2592
2593 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2594 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2595
2596 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2597 if (slave->slaveseldb != dictid) {
2598 robj *selectcmd;
2599
2600 switch(dictid) {
2601 case 0: selectcmd = shared.select0; break;
2602 case 1: selectcmd = shared.select1; break;
2603 case 2: selectcmd = shared.select2; break;
2604 case 3: selectcmd = shared.select3; break;
2605 case 4: selectcmd = shared.select4; break;
2606 case 5: selectcmd = shared.select5; break;
2607 case 6: selectcmd = shared.select6; break;
2608 case 7: selectcmd = shared.select7; break;
2609 case 8: selectcmd = shared.select8; break;
2610 case 9: selectcmd = shared.select9; break;
2611 default:
2612 selectcmd = createObject(REDIS_STRING,
2613 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2614 selectcmd->refcount = 0;
2615 break;
2616 }
2617 addReply(slave,selectcmd);
2618 slave->slaveseldb = dictid;
2619 }
2620 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2621 }
40d224a9 2622 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2623 if (outv != static_outv) zfree(outv);
ed9b544e 2624}
2625
dd142b9c 2626static sds sdscatrepr(sds s, char *p, size_t len) {
2627 s = sdscatlen(s,"\"",1);
2628 while(len--) {
2629 switch(*p) {
2630 case '\\':
2631 case '"':
2632 s = sdscatprintf(s,"\\%c",*p);
2633 break;
2634 case '\n': s = sdscatlen(s,"\\n",1); break;
2635 case '\r': s = sdscatlen(s,"\\r",1); break;
2636 case '\t': s = sdscatlen(s,"\\t",1); break;
2637 case '\a': s = sdscatlen(s,"\\a",1); break;
2638 case '\b': s = sdscatlen(s,"\\b",1); break;
2639 default:
2640 if (isprint(*p))
2641 s = sdscatprintf(s,"%c",*p);
2642 else
2643 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2644 break;
2645 }
2646 p++;
2647 }
2648 return sdscatlen(s,"\"",1);
2649}
2650
2651static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2652 listNode *ln;
2653 listIter li;
2654 int j;
2655 sds cmdrepr = sdsnew("+");
2656 robj *cmdobj;
2657 struct timeval tv;
2658
2659 gettimeofday(&tv,NULL);
2660 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2661 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2662
2663 for (j = 0; j < argc; j++) {
2664 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2665 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2666 } else {
2667 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2668 sdslen(argv[j]->ptr));
2669 }
2670 if (j != argc-1)
2671 cmdrepr = sdscatlen(cmdrepr," ",1);
2672 }
2673 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2674 cmdobj = createObject(REDIS_STRING,cmdrepr);
2675
2676 listRewind(monitors,&li);
2677 while((ln = listNext(&li))) {
2678 redisClient *monitor = ln->value;
2679 addReply(monitor,cmdobj);
2680 }
2681 decrRefCount(cmdobj);
2682}
2683
638e42ac 2684static void processInputBuffer(redisClient *c) {
ed9b544e 2685again:
4409877e 2686 /* Before to process the input buffer, make sure the client is not
2687 * waitig for a blocking operation such as BLPOP. Note that the first
2688 * iteration the client is never blocked, otherwise the processInputBuffer
2689 * would not be called at all, but after the execution of the first commands
2690 * in the input buffer the client may be blocked, and the "goto again"
2691 * will try to reiterate. The following line will make it return asap. */
92f8e882 2692 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2693 if (c->bulklen == -1) {
2694 /* Read the first line of the query */
2695 char *p = strchr(c->querybuf,'\n');
2696 size_t querylen;
644fafa3 2697
ed9b544e 2698 if (p) {
2699 sds query, *argv;
2700 int argc, j;
e0a62c7f 2701
ed9b544e 2702 query = c->querybuf;
2703 c->querybuf = sdsempty();
2704 querylen = 1+(p-(query));
2705 if (sdslen(query) > querylen) {
2706 /* leave data after the first line of the query in the buffer */
2707 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2708 }
2709 *p = '\0'; /* remove "\n" */
2710 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2711 sdsupdatelen(query);
2712
2713 /* Now we can split the query in arguments */
ed9b544e 2714 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2715 sdsfree(query);
2716
2717 if (c->argv) zfree(c->argv);
2718 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2719
2720 for (j = 0; j < argc; j++) {
ed9b544e 2721 if (sdslen(argv[j])) {
2722 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2723 c->argc++;
2724 } else {
2725 sdsfree(argv[j]);
2726 }
2727 }
2728 zfree(argv);
7c49733c 2729 if (c->argc) {
2730 /* Execute the command. If the client is still valid
2731 * after processCommand() return and there is something
2732 * on the query buffer try to process the next command. */
2733 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2734 } else {
2735 /* Nothing to process, argc == 0. Just process the query
2736 * buffer if it's not empty or return to the caller */
2737 if (sdslen(c->querybuf)) goto again;
2738 }
ed9b544e 2739 return;
644fafa3 2740 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2741 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2742 freeClient(c);
2743 return;
2744 }
2745 } else {
2746 /* Bulk read handling. Note that if we are at this point
2747 the client already sent a command terminated with a newline,
2748 we are reading the bulk data that is actually the last
2749 argument of the command. */
2750 int qbl = sdslen(c->querybuf);
2751
2752 if (c->bulklen <= qbl) {
2753 /* Copy everything but the final CRLF as final argument */
2754 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2755 c->argc++;
2756 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2757 /* Process the command. If the client is still valid after
2758 * the processing and there is more data in the buffer
2759 * try to parse it. */
2760 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2761 return;
2762 }
2763 }
2764}
2765
638e42ac 2766static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2767 redisClient *c = (redisClient*) privdata;
2768 char buf[REDIS_IOBUF_LEN];
2769 int nread;
2770 REDIS_NOTUSED(el);
2771 REDIS_NOTUSED(mask);
2772
2773 nread = read(fd, buf, REDIS_IOBUF_LEN);
2774 if (nread == -1) {
2775 if (errno == EAGAIN) {
2776 nread = 0;
2777 } else {
f870935d 2778 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2779 freeClient(c);
2780 return;
2781 }
2782 } else if (nread == 0) {
f870935d 2783 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2784 freeClient(c);
2785 return;
2786 }
2787 if (nread) {
2788 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2789 c->lastinteraction = time(NULL);
2790 } else {
2791 return;
2792 }
168ac5c6 2793 processInputBuffer(c);
638e42ac 2794}
2795
ed9b544e 2796static int selectDb(redisClient *c, int id) {
2797 if (id < 0 || id >= server.dbnum)
2798 return REDIS_ERR;
3305306f 2799 c->db = &server.db[id];
ed9b544e 2800 return REDIS_OK;
2801}
2802
40d224a9 2803static void *dupClientReplyValue(void *o) {
2804 incrRefCount((robj*)o);
12d090d2 2805 return o;
40d224a9 2806}
2807
/* Match method for lists of string objects: returns whatever
 * equalStringObjects() returns for the two values. */
static int listMatchObjects(void *a, void *b) {
    int match = equalStringObjects(a,b);

    return match;
}
2811
ed9b544e 2812static redisClient *createClient(int fd) {
2813 redisClient *c = zmalloc(sizeof(*c));
2814
2815 anetNonBlock(NULL,fd);
2816 anetTcpNoDelay(NULL,fd);
2817 if (!c) return NULL;
2818 selectDb(c,0);
2819 c->fd = fd;
2820 c->querybuf = sdsempty();
2821 c->argc = 0;
93ea3759 2822 c->argv = NULL;
ed9b544e 2823 c->bulklen = -1;
e8a74421 2824 c->multibulk = 0;
2825 c->mbargc = 0;
2826 c->mbargv = NULL;
ed9b544e 2827 c->sentlen = 0;
2828 c->flags = 0;
2829 c->lastinteraction = time(NULL);
abcb223e 2830 c->authenticated = 0;
40d224a9 2831 c->replstate = REDIS_REPL_NONE;
6b47e12e 2832 c->reply = listCreate();
ed9b544e 2833 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2834 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2835 c->blocking_keys = NULL;
2836 c->blocking_keys_num = 0;
92f8e882 2837 c->io_keys = listCreate();
87c68815 2838 c->watched_keys = listCreate();
92f8e882 2839 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2840 c->pubsub_channels = dictCreate(&setDictType,NULL);
2841 c->pubsub_patterns = listCreate();
2842 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2843 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2844 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2845 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2846 freeClient(c);
2847 return NULL;
2848 }
6b47e12e 2849 listAddNodeTail(server.clients,c);
6e469882 2850 initClientMultiState(c);
ed9b544e 2851 return c;
2852}
2853
2854static void addReply(redisClient *c, robj *obj) {
2855 if (listLength(c->reply) == 0 &&
6208b3a7 2856 (c->replstate == REDIS_REPL_NONE ||
2857 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2858 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2859 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2860
2861 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2862 obj = dupStringObject(obj);
2863 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2864 }
9d65a1bb 2865 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2866}
2867
2868static void addReplySds(redisClient *c, sds s) {
2869 robj *o = createObject(REDIS_STRING,s);
2870 addReply(c,o);
2871 decrRefCount(o);
2872}
2873
e2665397 2874static void addReplyDouble(redisClient *c, double d) {
2875 char buf[128];
2876
2877 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2878 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2879 (unsigned long) strlen(buf),buf));
e2665397 2880}
2881
aa7c2934
PN
2882static void addReplyLongLong(redisClient *c, long long ll) {
2883 char buf[128];
2884 size_t len;
2885
2886 if (ll == 0) {
2887 addReply(c,shared.czero);
2888 return;
2889 } else if (ll == 1) {
2890 addReply(c,shared.cone);
2891 return;
2892 }
482b672d 2893 buf[0] = ':';
2894 len = ll2string(buf+1,sizeof(buf)-1,ll);
2895 buf[len+1] = '\r';
2896 buf[len+2] = '\n';
2897 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2898}
2899
92b27fe9 2900static void addReplyUlong(redisClient *c, unsigned long ul) {
2901 char buf[128];
2902 size_t len;
2903
dd88747b 2904 if (ul == 0) {
2905 addReply(c,shared.czero);
2906 return;
2907 } else if (ul == 1) {
2908 addReply(c,shared.cone);
2909 return;
2910 }
92b27fe9 2911 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2912 addReplySds(c,sdsnewlen(buf,len));
2913}
2914
942a3961 2915static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2916 size_t len, intlen;
2917 char buf[128];
942a3961 2918
2919 if (obj->encoding == REDIS_ENCODING_RAW) {
2920 len = sdslen(obj->ptr);
2921 } else {
2922 long n = (long)obj->ptr;
2923
e054afda 2924 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2925 len = 1;
2926 if (n < 0) {
2927 len++;
2928 n = -n;
2929 }
2930 while((n = n/10) != 0) {
2931 len++;
2932 }
2933 }
482b672d 2934 buf[0] = '$';
2935 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2936 buf[intlen+1] = '\r';
2937 buf[intlen+2] = '\n';
2938 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2939}
2940
dd88747b 2941static void addReplyBulk(redisClient *c, robj *obj) {
2942 addReplyBulkLen(c,obj);
2943 addReply(c,obj);
2944 addReply(c,shared.crlf);
2945}
2946
09241813 2947static void addReplyBulkSds(redisClient *c, sds s) {
2948 robj *o = createStringObject(s, sdslen(s));
2949 addReplyBulk(c,o);
2950 decrRefCount(o);
2951}
2952
500ece7c 2953/* In the CONFIG command we need to add vanilla C string as bulk replies */
2954static void addReplyBulkCString(redisClient *c, char *s) {
2955 if (s == NULL) {
2956 addReply(c,shared.nullbulk);
2957 } else {
2958 robj *o = createStringObject(s,strlen(s));
2959 addReplyBulk(c,o);
2960 decrRefCount(o);
2961 }
2962}
2963
ed9b544e 2964static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2965 int cport, cfd;
2966 char cip[128];
285add55 2967 redisClient *c;
ed9b544e 2968 REDIS_NOTUSED(el);
2969 REDIS_NOTUSED(mask);
2970 REDIS_NOTUSED(privdata);
2971
2972 cfd = anetAccept(server.neterr, fd, cip, &cport);
2973 if (cfd == AE_ERR) {
f870935d 2974 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2975 return;
2976 }
f870935d 2977 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2978 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2979 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2980 close(cfd); /* May be already closed, just ingore errors */
2981 return;
2982 }
285add55 2983 /* If maxclient directive is set and this is one client more... close the
2984 * connection. Note that we create the client instead to check before
2985 * for this condition, since now the socket is already set in nonblocking
2986 * mode and we can send an error for free using the Kernel I/O */
2987 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2988 char *err = "-ERR max number of clients reached\r\n";
2989
2990 /* That's a best effort error message, don't check write errors */
fee803ba 2991 if (write(c->fd,err,strlen(err)) == -1) {
2992 /* Nothing to do, Just to avoid the warning... */
2993 }
285add55 2994 freeClient(c);
2995 return;
2996 }
ed9b544e 2997 server.stat_numconnections++;
2998}
2999
3000/* ======================= Redis objects implementation ===================== */
3001
3002static robj *createObject(int type, void *ptr) {
3003 robj *o;
3004
a5819310 3005 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3006 if (listLength(server.objfreelist)) {
3007 listNode *head = listFirst(server.objfreelist);
3008 o = listNodeValue(head);
3009 listDelNode(server.objfreelist,head);
a5819310 3010 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3011 } else {
560db612 3012 if (server.vm_enabled)
a5819310 3013 pthread_mutex_unlock(&server.obj_freelist_mutex);
560db612 3014 o = zmalloc(sizeof(*o));
ed9b544e 3015 }
ed9b544e 3016 o->type = type;
942a3961 3017 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 3018 o->ptr = ptr;
3019 o->refcount = 1;
3a66edc7 3020 if (server.vm_enabled) {
1064ef87 3021 /* Note that this code may run in the context of an I/O thread
560db612 3022 * and accessing server.lruclock in theory is an error
1064ef87 3023 * (no locks). But in practice this is safe, and even if we read
560db612 3024 * garbage Redis will not fail. */
3025 o->lru = server.lruclock;
3a66edc7 3026 o->storage = REDIS_VM_MEMORY;
3027 }
ed9b544e 3028 return o;
3029}
3030
3031static robj *createStringObject(char *ptr, size_t len) {
3032 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3033}
3034
3f973463
PN
3035static robj *createStringObjectFromLongLong(long long value) {
3036 robj *o;
3037 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3038 incrRefCount(shared.integers[value]);
3039 o = shared.integers[value];
3040 } else {
3f973463 3041 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 3042 o = createObject(REDIS_STRING, NULL);
3f973463
PN
3043 o->encoding = REDIS_ENCODING_INT;
3044 o->ptr = (void*)((long)value);
3045 } else {
ee14da56 3046 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
3047 }
3048 }
3049 return o;
3050}
3051
4ef8de8a 3052static robj *dupStringObject(robj *o) {
b9bc0eef 3053 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 3054 return createStringObject(o->ptr,sdslen(o->ptr));
3055}
3056
ed9b544e 3057static robj *createListObject(void) {
3058 list *l = listCreate();
1cd92e7f 3059 robj *o = createObject(REDIS_LIST,l);
ed9b544e 3060 listSetFreeMethod(l,decrRefCount);
1cd92e7f
PN
3061 o->encoding = REDIS_ENCODING_LIST;
3062 return o;
3063}
3064
3065static robj *createZiplistObject(void) {
3066 unsigned char *zl = ziplistNew();
3067 robj *o = createObject(REDIS_LIST,zl);
3068 o->encoding = REDIS_ENCODING_ZIPLIST;
3069 return o;
ed9b544e 3070}
3071
3072static robj *createSetObject(void) {
3073 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 3074 return createObject(REDIS_SET,d);
3075}
3076
5234952b 3077static robj *createHashObject(void) {
3078 /* All the Hashes start as zipmaps. Will be automatically converted
3079 * into hash tables if there are enough elements or big elements
3080 * inside. */
3081 unsigned char *zm = zipmapNew();
3082 robj *o = createObject(REDIS_HASH,zm);
3083 o->encoding = REDIS_ENCODING_ZIPMAP;
3084 return o;
3085}
3086
1812e024 3087static robj *createZsetObject(void) {
6b47e12e 3088 zset *zs = zmalloc(sizeof(*zs));
3089
3090 zs->dict = dictCreate(&zsetDictType,NULL);
3091 zs->zsl = zslCreate();
3092 return createObject(REDIS_ZSET,zs);
1812e024 3093}
3094
ed9b544e 3095static void freeStringObject(robj *o) {
942a3961 3096 if (o->encoding == REDIS_ENCODING_RAW) {
3097 sdsfree(o->ptr);
3098 }
ed9b544e 3099}
3100
3101static void freeListObject(robj *o) {
c7d9d662
PN
3102 switch (o->encoding) {
3103 case REDIS_ENCODING_LIST:
3104 listRelease((list*) o->ptr);
3105 break;
3106 case REDIS_ENCODING_ZIPLIST:
3107 zfree(o->ptr);
3108 break;
3109 default:
3110 redisPanic("Unknown list encoding type");
3111 }
ed9b544e 3112}
3113
3114static void freeSetObject(robj *o) {
3115 dictRelease((dict*) o->ptr);
3116}
3117
fd8ccf44 3118static void freeZsetObject(robj *o) {
3119 zset *zs = o->ptr;
3120
3121 dictRelease(zs->dict);
3122 zslFree(zs->zsl);
3123 zfree(zs);
3124}
3125
ed9b544e 3126static void freeHashObject(robj *o) {
cbba7dd7 3127 switch (o->encoding) {
3128 case REDIS_ENCODING_HT:
3129 dictRelease((dict*) o->ptr);
3130 break;
3131 case REDIS_ENCODING_ZIPMAP:
3132 zfree(o->ptr);
3133 break;
3134 default:
f83c6cb5 3135 redisPanic("Unknown hash encoding type");
cbba7dd7 3136 break;
3137 }
ed9b544e 3138}
3139
3140static void incrRefCount(robj *o) {
3141 o->refcount++;
3142}
3143
3144static void decrRefCount(void *obj) {
3145 robj *o = obj;
94754ccc 3146
560db612 3147 /* Object is a swapped out value, or in the process of being loaded. */
996cb5f7 3148 if (server.vm_enabled &&
3149 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3150 {
560db612 3151 vmpointer *vp = obj;
3152 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3153 vmMarkPagesFree(vp->page,vp->usedpages);
7d98e08c 3154 server.vm_stats_swapped_objects--;
560db612 3155 zfree(vp);
a35ddf12 3156 return;
3157 }
560db612 3158
3159 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
e4ed181d 3160 /* Object is in memory, or in the process of being swapped out.
3161 *
3162 * If the object is being swapped out, abort the operation on
3163 * decrRefCount even if the refcount does not drop to 0: the object
3164 * is referenced at least two times, as value of the key AND as
3165 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3166 * done but the relevant key was removed in the meantime, the
3167 * complete jobs handler will not find the key about the job and the
3168 * assert will fail. */
3169 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3170 vmCancelThreadedIOJob(o);
ed9b544e 3171 if (--(o->refcount) == 0) {
3172 switch(o->type) {
3173 case REDIS_STRING: freeStringObject(o); break;
3174 case REDIS_LIST: freeListObject(o); break;
3175 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3176 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3177 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3178 default: redisPanic("Unknown object type"); break;
ed9b544e 3179 }
a5819310 3180 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3181 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3182 !listAddNodeHead(server.objfreelist,o))
3183 zfree(o);
a5819310 3184 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3185 }
3186}
3187
92b27fe9 3188static int checkType(redisClient *c, robj *o, int type) {
3189 if (o->type != type) {
3190 addReply(c,shared.wrongtypeerr);
3191 return 1;
3192 }
3193 return 0;
3194}
3195
724a51b1 3196/* Check if the nul-terminated string 's' can be represented by a long
3197 * (that is, is a number that fits into long without any other space or
3198 * character before or after the digits).
3199 *
3200 * If so, the function returns REDIS_OK and *longval is set to the value
3201 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3202static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3203 char buf[32], *endptr;
3204 long value;
3205 int slen;
e0a62c7f 3206
724a51b1 3207 value = strtol(s, &endptr, 10);
3208 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3209 slen = ll2string(buf,32,value);
724a51b1 3210
3211 /* If the number converted back into a string is not identical
3212 * then it's not possible to encode the string as integer */
f69f2cba 3213 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3214 if (longval) *longval = value;
3215 return REDIS_OK;
3216}
3217
942a3961 3218/* Try to encode a string object in order to save space */
05df7621 3219static robj *tryObjectEncoding(robj *o) {
942a3961 3220 long value;
942a3961 3221 sds s = o->ptr;
3305306f 3222
942a3961 3223 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3224 return o; /* Already encoded */
3305306f 3225
05df7621 3226 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3227 * everywhere in the "object space" of Redis. Encoded objects can only
3228 * appear as "values" (and not, for instance, as keys) */
05df7621 3229 if (o->refcount > 1) return o;
3305306f 3230
942a3961 3231 /* Currently we try to encode only strings */
dfc5e96c 3232 redisAssert(o->type == REDIS_STRING);
94754ccc 3233
724a51b1 3234 /* Check if we can represent this string as a long integer */
05df7621 3235 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3236
3237 /* Ok, this object can be encoded */
05df7621 3238 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3239 decrRefCount(o);
3240 incrRefCount(shared.integers[value]);
3241 return shared.integers[value];
3242 } else {
3243 o->encoding = REDIS_ENCODING_INT;
3244 sdsfree(o->ptr);
3245 o->ptr = (void*) value;
3246 return o;
3247 }
942a3961 3248}
3249
9d65a1bb 3250/* Get a decoded version of an encoded object (returned as a new object).
3251 * If the object is already raw-encoded just increment the ref count. */
3252static robj *getDecodedObject(robj *o) {
942a3961 3253 robj *dec;
e0a62c7f 3254
9d65a1bb 3255 if (o->encoding == REDIS_ENCODING_RAW) {
3256 incrRefCount(o);
3257 return o;
3258 }
942a3961 3259 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3260 char buf[32];
3261
ee14da56 3262 ll2string(buf,32,(long)o->ptr);
942a3961 3263 dec = createStringObject(buf,strlen(buf));
3264 return dec;
3265 } else {
08ee9b57 3266 redisPanic("Unknown encoding type");
942a3961 3267 }
3305306f 3268}
3269
d7f43c08 3270/* Compare two string objects via strcmp() or alike.
3271 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3272 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3273 * and compare the strings, it's much faster than calling getDecodedObject().
3274 *
3275 * Important note: if objects are not integer encoded, but binary-safe strings,
3276 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3277 * binary safe. */
724a51b1 3278static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3279 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3280 char bufa[128], bufb[128], *astr, *bstr;
3281 int bothsds = 1;
724a51b1 3282
e197b441 3283 if (a == b) return 0;
d7f43c08 3284 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3285 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3286 astr = bufa;
3287 bothsds = 0;
724a51b1 3288 } else {
d7f43c08 3289 astr = a->ptr;
724a51b1 3290 }
d7f43c08 3291 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3292 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3293 bstr = bufb;
3294 bothsds = 0;
3295 } else {
3296 bstr = b->ptr;
3297 }
3298 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3299}
3300
bf028098 3301/* Equal string objects return 1 if the two objects are the same from the
3302 * point of view of a string comparison, otherwise 0 is returned. Note that
3303 * this function is faster then checking for (compareStringObject(a,b) == 0)
3304 * because it can perform some more optimization. */
3305static int equalStringObjects(robj *a, robj *b) {
3306 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3307 return a->ptr == b->ptr;
3308 } else {
3309 return compareStringObjects(a,b) == 0;
3310 }
3311}
3312
0ea663ea 3313static size_t stringObjectLen(robj *o) {
dfc5e96c 3314 redisAssert(o->type == REDIS_STRING);
0ea663ea 3315 if (o->encoding == REDIS_ENCODING_RAW) {
3316 return sdslen(o->ptr);
3317 } else {
3318 char buf[32];
3319
ee14da56 3320 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3321 }
3322}
3323
bd79a6bd
PN
3324static int getDoubleFromObject(robj *o, double *target) {
3325 double value;
682c73e8 3326 char *eptr;
bbe025e0 3327
bd79a6bd
PN
3328 if (o == NULL) {
3329 value = 0;
3330 } else {
3331 redisAssert(o->type == REDIS_STRING);
3332 if (o->encoding == REDIS_ENCODING_RAW) {
3333 value = strtod(o->ptr, &eptr);
682c73e8 3334 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3335 } else if (o->encoding == REDIS_ENCODING_INT) {
3336 value = (long)o->ptr;
3337 } else {
946342c1 3338 redisPanic("Unknown string encoding");
bd79a6bd
PN
3339 }
3340 }
3341
bd79a6bd
PN
3342 *target = value;
3343 return REDIS_OK;
3344}
bbe025e0 3345
bd79a6bd
PN
3346static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3347 double value;
3348 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3349 if (msg != NULL) {
3350 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3351 } else {
3352 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3353 }
bbe025e0
AM
3354 return REDIS_ERR;
3355 }
3356
bd79a6bd 3357 *target = value;
bbe025e0
AM
3358 return REDIS_OK;
3359}
3360
bd79a6bd
PN
3361static int getLongLongFromObject(robj *o, long long *target) {
3362 long long value;
682c73e8 3363 char *eptr;
bbe025e0 3364
bd79a6bd
PN
3365 if (o == NULL) {
3366 value = 0;
3367 } else {
3368 redisAssert(o->type == REDIS_STRING);
3369 if (o->encoding == REDIS_ENCODING_RAW) {
3370 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3371 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3372 } else if (o->encoding == REDIS_ENCODING_INT) {
3373 value = (long)o->ptr;
3374 } else {
946342c1 3375 redisPanic("Unknown string encoding");
bd79a6bd
PN
3376 }
3377 }
3378
bd79a6bd
PN
3379 *target = value;
3380 return REDIS_OK;
3381}
bbe025e0 3382
bd79a6bd
PN
3383static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3384 long long value;
3385 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3386 if (msg != NULL) {
3387 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3388 } else {
3389 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3390 }
bbe025e0
AM
3391 return REDIS_ERR;
3392 }
3393
bd79a6bd 3394 *target = value;
bbe025e0
AM
3395 return REDIS_OK;
3396}
3397
bd79a6bd
PN
3398static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3399 long long value;
bbe025e0 3400
bd79a6bd
PN
3401 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3402 if (value < LONG_MIN || value > LONG_MAX) {
3403 if (msg != NULL) {
3404 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3405 } else {
3406 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3407 }
bbe025e0
AM
3408 return REDIS_ERR;
3409 }
3410
bd79a6bd 3411 *target = value;
bbe025e0
AM
3412 return REDIS_OK;
3413}
3414
612e4de8 3415/* =========================== Keyspace access API ========================== */
3416
3417static robj *lookupKey(redisDb *db, robj *key) {
09241813 3418 dictEntry *de = dictFind(db->dict,key->ptr);
612e4de8 3419 if (de) {
612e4de8 3420 robj *val = dictGetEntryVal(de);
3421
3422 if (server.vm_enabled) {
3423 if (val->storage == REDIS_VM_MEMORY ||
3424 val->storage == REDIS_VM_SWAPPING)
3425 {
3426 /* If we were swapping the object out, cancel the operation */
3427 if (val->storage == REDIS_VM_SWAPPING)
3428 vmCancelThreadedIOJob(val);
09241813 3429 /* Update the access time for the aging algorithm. */
612e4de8 3430 val->lru = server.lruclock;
3431 } else {
3432 int notify = (val->storage == REDIS_VM_LOADING);
3433
3434 /* Our value was swapped on disk. Bring it at home. */
3435 redisAssert(val->type == REDIS_VMPOINTER);
3436 val = vmLoadObject(val);
3437 dictGetEntryVal(de) = val;
3438
3439 /* Clients blocked by the VM subsystem may be waiting for
3440 * this key... */
3441 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3442 }
3443 }
3444 return val;
3445 } else {
3446 return NULL;
3447 }
3448}
3449
3450static robj *lookupKeyRead(redisDb *db, robj *key) {
3451 expireIfNeeded(db,key);
3452 return lookupKey(db,key);
3453}
3454
3455static robj *lookupKeyWrite(redisDb *db, robj *key) {
3456 deleteIfVolatile(db,key);
3457 touchWatchedKey(db,key);
3458 return lookupKey(db,key);
3459}
3460
3461static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3462 robj *o = lookupKeyRead(c->db, key);
3463 if (!o) addReply(c,reply);
3464 return o;
3465}
3466
3467static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3468 robj *o = lookupKeyWrite(c->db, key);
3469 if (!o) addReply(c,reply);
3470 return o;
3471}
3472
09241813 3473/* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3474 * otherwise REDIS_OK is returned, and the caller should increment the
3475 * refcount of 'val'. */
3476static int dbAdd(redisDb *db, robj *key, robj *val) {
3477 /* Perform a lookup before adding the key, as we need to copy the
3478 * key value. */
3479 if (dictFind(db->dict, key->ptr) != NULL) {
3480 return REDIS_ERR;
3481 } else {
3482 sds copy = sdsdup(key->ptr);
3483 dictAdd(db->dict, copy, val);
3484 return REDIS_OK;
3485 }
3486}
3487
3488/* If the key does not exist, this is just like dbAdd(). Otherwise
3489 * the value associated to the key is replaced with the new one.
3490 *
3491 * On update (key already existed) 0 is returned. Otherwise 1. */
3492static int dbReplace(redisDb *db, robj *key, robj *val) {
3493 if (dictFind(db->dict,key->ptr) == NULL) {
3494 sds copy = sdsdup(key->ptr);
3495 dictAdd(db->dict, copy, val);
3496 return 1;
3497 } else {
3498 dictReplace(db->dict, key->ptr, val);
3499 return 0;
3500 }
3501}
3502
3503static int dbExists(redisDb *db, robj *key) {
3504 return dictFind(db->dict,key->ptr) != NULL;
3505}
3506
3507/* Return a random key, in form of a Redis object.
3508 * If there are no keys, NULL is returned.
3509 *
3510 * The function makes sure to return keys not already expired. */
3511static robj *dbRandomKey(redisDb *db) {
3512 struct dictEntry *de;
3513
3514 while(1) {
3515 sds key;
3516 robj *keyobj;
3517
3518 de = dictGetRandomKey(db->dict);
3519 if (de == NULL) return NULL;
3520
3521 key = dictGetEntryKey(de);
3522 keyobj = createStringObject(key,sdslen(key));
3523 if (dictFind(db->expires,key)) {
3524 if (expireIfNeeded(db,keyobj)) {
3525 decrRefCount(keyobj);
3526 continue; /* search for another key. This expired. */
3527 }
3528 }
3529 return keyobj;
3530 }
3531}
3532
3533/* Delete a key, value, and associated expiration entry if any, from the DB */
3534static int dbDelete(redisDb *db, robj *key) {
829137b9
PN
3535 /* Deleting an entry from the expires dict will not free the sds of
3536 * the key, because it is shared with the main dictionary. */
3537 if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);
3538 return dictDelete(db->dict,key->ptr) == DICT_OK;
612e4de8 3539}
3540
06233c45 3541/*============================ RDB saving/loading =========================== */
ed9b544e 3542
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if the byte
 * could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3547
/* Write time 't' to 'fp' as a 4 byte signed integer in host byte order
 * (the value is truncated to 32 bits). Returns 0 on success, -1 on write
 * error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3553
e3566d4b 3554/* check rdbLoadLen() comments for more info */
f78fd11b 3555static int rdbSaveLen(FILE *fp, uint32_t len) {
3556 unsigned char buf[2];
3557
3558 if (len < (1<<6)) {
3559 /* Save a 6 bit len */
10c43610 3560 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3561 if (fwrite(buf,1,1,fp) == 0) return -1;
3562 } else if (len < (1<<14)) {
3563 /* Save a 14 bit len */
10c43610 3564 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3565 buf[1] = len&0xFF;
17be1a4a 3566 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3567 } else {
3568 /* Save a 32 bit len */
10c43610 3569 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3570 if (fwrite(buf,1,1,fp) == 0) return -1;
3571 len = htonl(len);
3572 if (fwrite(&len,4,1,fp) == 0) return -1;
3573 }
3574 return 0;
3575}
3576
32a66513 3577/* Encode 'value' as an integer if possible (if integer will fit the
3578 * supported range). If the function sucessful encoded the integer
3579 * then the (up to 5 bytes) encoded representation is written in the
3580 * string pointed by 'enc' and the length is returned. Otherwise
3581 * 0 is returned. */
3582static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3583 /* Finally check if it fits in our ranges */
3584 if (value >= -(1<<7) && value <= (1<<7)-1) {
3585 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3586 enc[1] = value&0xFF;
3587 return 2;
3588 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3589 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3590 enc[1] = value&0xFF;
3591 enc[2] = (value>>8)&0xFF;
3592 return 3;
3593 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3594 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3595 enc[1] = value&0xFF;
3596 enc[2] = (value>>8)&0xFF;
3597 enc[3] = (value>>16)&0xFF;
3598 enc[4] = (value>>24)&0xFF;
3599 return 5;
3600 } else {
3601 return 0;
3602 }
3603}
3604
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated
 * (this function is reached with (pointer,length) pairs). The bytes are
 * therefore copied into a local, terminated buffer before being parsed, so
 * that strtoll() never reads past the end of the input.
 *
 * Returns the length of the encoding written to 'enc', or 0 if the string
 * can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* An empty string is not a number, and a string too long for the local
     * buffer can never round-trip through ll2string() below. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3623
b1befe6a 3624static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3625 size_t comprlen, outlen;
774e3047 3626 unsigned char byte;
3627 void *out;
3628
3629 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3630 if (len <= 4) return 0;
3631 outlen = len-4;
3a2694c4 3632 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3633 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3634 if (comprlen == 0) {
88e85998 3635 zfree(out);
774e3047 3636 return 0;
3637 }
3638 /* Data compressed! Let's save it on disk */
3639 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3640 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3641 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3642 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3643 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3644 zfree(out);
774e3047 3645 return comprlen;
3646
3647writeerr:
88e85998 3648 zfree(out);
774e3047 3649 return -1;
3650}
3651
e3566d4b 3652/* Save a string objet as [len][data] on disk. If the object is a string
3653 * representation of an integer value we try to safe it in a special form */
b1befe6a 3654static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3655 int enclen;
10c43610 3656
774e3047 3657 /* Try integer encoding */
e3566d4b 3658 if (len <= 11) {
3659 unsigned char buf[5];
b1befe6a 3660 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3661 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3662 return 0;
3663 }
3664 }
774e3047 3665
3666 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3667 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3668 if (server.rdbcompression && len > 20) {
774e3047 3669 int retval;
3670
b1befe6a 3671 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3672 if (retval == -1) return -1;
3673 if (retval > 0) return 0;
3674 /* retval == 0 means data can't be compressed, save the old way */
3675 }
3676
3677 /* Store verbatim */
10c43610 3678 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3679 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3680 return 0;
3681}
3682
/* Save a long long value as either an integer-encoded string or, when the
 * value is out of the encodable range, as a length-prefixed decimal
 * string. Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int nbytes = rdbEncodeInteger(value,buf);

    if (nbytes <= 0) {
        /* Not encodable: emit the decimal digits preceded by their length. */
        nbytes = ll2string((char*)buf,32,value);
        redisAssert(nbytes < 32);
        if (rdbSaveLen(fp,nbytes) == -1) return -1;
    }
    if (fwrite(buf,nbytes,1,fp) == 0) return -1;
    return 0;
}
3698
942a3961 3699/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3700static int rdbSaveStringObject(FILE *fp, robj *obj) {
32a66513 3701 /* Avoid to decode the object, then encode it again, if the
3702 * object is alrady integer encoded. */
3703 if (obj->encoding == REDIS_ENCODING_INT) {
2796f6da 3704 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
996cb5f7 3705 } else {
2796f6da
PN
3706 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3707 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3708 }
942a3961 3709}
3710
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so one byte less is available. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3753
06233c45 3754/* Save a Redis object. */
3755static int rdbSaveObject(FILE *fp, robj *o) {
3756 if (o->type == REDIS_STRING) {
3757 /* Save a string value */
3758 if (rdbSaveStringObject(fp,o) == -1) return -1;
3759 } else if (o->type == REDIS_LIST) {
3760 /* Save a list value */
23f96494
PN
3761 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3762 unsigned char *p;
3763 unsigned char *vstr;
3764 unsigned int vlen;
3765 long long vlong;
3766
3767 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3768 p = ziplistIndex(o->ptr,0);
3769 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3770 if (vstr) {
3771 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3772 return -1;
3773 } else {
3774 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3775 return -1;
3776 }
3777 p = ziplistNext(o->ptr,p);
3778 }
3779 } else if (o->encoding == REDIS_ENCODING_LIST) {
3780 list *list = o->ptr;
3781 listIter li;
3782 listNode *ln;
3783
3784 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3785 listRewind(list,&li);
3786 while((ln = listNext(&li))) {
3787 robj *eleobj = listNodeValue(ln);
3788 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3789 }
3790 } else {
3791 redisPanic("Unknown list encoding");
06233c45 3792 }
3793 } else if (o->type == REDIS_SET) {
3794 /* Save a set value */
3795 dict *set = o->ptr;
3796 dictIterator *di = dictGetIterator(set);
3797 dictEntry *de;
3798
3799 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3800 while((de = dictNext(di)) != NULL) {
3801 robj *eleobj = dictGetEntryKey(de);
3802
3803 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3804 }
3805 dictReleaseIterator(di);
3806 } else if (o->type == REDIS_ZSET) {
3807 /* Save a set value */
3808 zset *zs = o->ptr;
3809 dictIterator *di = dictGetIterator(zs->dict);
3810 dictEntry *de;
3811
3812 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3813 while((de = dictNext(di)) != NULL) {
3814 robj *eleobj = dictGetEntryKey(de);
3815 double *score = dictGetEntryVal(de);
3816
3817 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3818 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3819 }
3820 dictReleaseIterator(di);
b1befe6a 3821 } else if (o->type == REDIS_HASH) {
3822 /* Save a hash value */
3823 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3824 unsigned char *p = zipmapRewind(o->ptr);
3825 unsigned int count = zipmapLen(o->ptr);
3826 unsigned char *key, *val;
3827 unsigned int klen, vlen;
3828
3829 if (rdbSaveLen(fp,count) == -1) return -1;
3830 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3831 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3832 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3833 }
3834 } else {
3835 dictIterator *di = dictGetIterator(o->ptr);
3836 dictEntry *de;
3837
3838 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3839 while((de = dictNext(di)) != NULL) {
3840 robj *key = dictGetEntryKey(de);
3841 robj *val = dictGetEntryVal(de);
3842
3843 if (rdbSaveStringObject(fp,key) == -1) return -1;
3844 if (rdbSaveStringObject(fp,val) == -1) return -1;
3845 }
3846 dictReleaseIterator(di);
3847 }
06233c45 3848 } else {
f83c6cb5 3849 redisPanic("Unknown object type");
06233c45 3850 }
3851 return 0;
3852}
3853
3854/* Return the length the object will have on disk if saved with
3855 * the rdbSaveObject() function. Currently we use a trick to get
3856 * this length with very little changes to the code. In the future
3857 * we could switch to a faster solution. */
b9bc0eef 3858static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3859 if (fp == NULL) fp = server.devnull;
06233c45 3860 rewind(fp);
3861 assert(rdbSaveObject(fp,o) != 1);
3862 return ftello(fp);
3863}
3864
06224fec 3865/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3866static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3867 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3868
06224fec 3869 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3870}
3871
ed9b544e 3872/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3873static int rdbSave(char *filename) {
ed9b544e 3874 dictIterator *di = NULL;
3875 dictEntry *de;
ed9b544e 3876 FILE *fp;
3877 char tmpfile[256];
3878 int j;
bb32ede5 3879 time_t now = time(NULL);
ed9b544e 3880
2316bb3b 3881 /* Wait for I/O therads to terminate, just in case this is a
3882 * foreground-saving, to avoid seeking the swap file descriptor at the
3883 * same time. */
3884 if (server.vm_enabled)
3885 waitEmptyIOJobsQueue();
3886
a3b21203 3887 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3888 fp = fopen(tmpfile,"w");
3889 if (!fp) {
3890 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3891 return REDIS_ERR;
3892 }
f78fd11b 3893 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3894 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3895 redisDb *db = server.db+j;
3896 dict *d = db->dict;
3305306f 3897 if (dictSize(d) == 0) continue;
ed9b544e 3898 di = dictGetIterator(d);
3899 if (!di) {
3900 fclose(fp);
3901 return REDIS_ERR;
3902 }
3903
3904 /* Write the SELECT DB opcode */
f78fd11b 3905 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3906 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3907
3908 /* Iterate this DB writing every entry */
3909 while((de = dictNext(di)) != NULL) {
09241813 3910 sds keystr = dictGetEntryKey(de);
3911 robj key, *o = dictGetEntryVal(de);
3912 time_t expiretime;
3913
3914 initStaticStringObject(key,keystr);
3915 expiretime = getExpire(db,&key);
bb32ede5 3916
3917 /* Save the expire time */
3918 if (expiretime != -1) {
3919 /* If this key is already expired skip it */
3920 if (expiretime < now) continue;
3921 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3922 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3923 }
7e69548d 3924 /* Save the key and associated value. This requires special
3925 * handling if the value is swapped out. */
560db612 3926 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3927 o->storage == REDIS_VM_SWAPPING) {
7e69548d 3928 /* Save type, key, value */
3929 if (rdbSaveType(fp,o->type) == -1) goto werr;
09241813 3930 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3931 if (rdbSaveObject(fp,o) == -1) goto werr;
3932 } else {
996cb5f7 3933 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3934 robj *po;
7e69548d 3935 /* Get a preview of the object in memory */
560db612 3936 po = vmPreviewObject(o);
7e69548d 3937 /* Save type, key, value */
560db612 3938 if (rdbSaveType(fp,po->type) == -1) goto werr;
09241813 3939 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3940 if (rdbSaveObject(fp,po) == -1) goto werr;
3941 /* Remove the loaded object from memory */
3942 decrRefCount(po);
7e69548d 3943 }
ed9b544e 3944 }
3945 dictReleaseIterator(di);
3946 }
3947 /* EOF opcode */
f78fd11b 3948 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3949
3950 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3951 fflush(fp);
3952 fsync(fileno(fp));
3953 fclose(fp);
e0a62c7f 3954
ed9b544e 3955 /* Use RENAME to make sure the DB file is changed atomically only
3956 * if the generate DB file is ok. */
3957 if (rename(tmpfile,filename) == -1) {
325d1eb4 3958 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3959 unlink(tmpfile);
3960 return REDIS_ERR;
3961 }
3962 redisLog(REDIS_NOTICE,"DB saved on disk");
3963 server.dirty = 0;
3964 server.lastsave = time(NULL);
3965 return REDIS_OK;
3966
3967werr:
3968 fclose(fp);
3969 unlink(tmpfile);
3970 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3971 if (di) dictReleaseIterator(di);
3972 return REDIS_ERR;
3973}
3974
f78fd11b 3975static int rdbSaveBackground(char *filename) {
ed9b544e 3976 pid_t childpid;
3977
9d65a1bb 3978 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3979 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3980 if ((childpid = fork()) == 0) {
3981 /* Child */
054e426d 3982 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3983 close(server.fd);
f78fd11b 3984 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3985 _exit(0);
ed9b544e 3986 } else {
478c2c6f 3987 _exit(1);
ed9b544e 3988 }
3989 } else {
3990 /* Parent */
5a7c647e 3991 if (childpid == -1) {
3992 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3993 strerror(errno));
3994 return REDIS_ERR;
3995 }
ed9b544e 3996 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3997 server.bgsavechildpid = childpid;
884d4b39 3998 updateDictResizePolicy();
ed9b544e 3999 return REDIS_OK;
4000 }
4001 return REDIS_OK; /* unreached */
4002}
4003
/* Remove the temporary dump file "temp-<pid>.rdb" (relative to the current
 * working directory) belonging to the process with pid 'childpid'. The
 * result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,256,"temp-%d.rdb", pid);
    unlink(path);
}
4010
/* Read a one byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
4016
/* Read a time stored as a 4 byte signed integer in host byte order from
 * 'fp'. Returns the value as time_t, or -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    return (fread(&seconds,4,1,fp) == 0) ? -1 : (time_t) seconds;
}
4022
e3566d4b 4023/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4024 * of this file for a description of how this are stored on disk.
4025 *
4026 * isencoded is set to 1 if the readed length is not actually a length but
4027 * an "encoding type", check the above comments for more info */
c78a8ccc 4028static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 4029 unsigned char buf[2];
4030 uint32_t len;
c78a8ccc 4031 int type;
f78fd11b 4032
e3566d4b 4033 if (isencoded) *isencoded = 0;
c78a8ccc 4034 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4035 type = (buf[0]&0xC0)>>6;
4036 if (type == REDIS_RDB_6BITLEN) {
4037 /* Read a 6 bit len */
4038 return buf[0]&0x3F;
4039 } else if (type == REDIS_RDB_ENCVAL) {
4040 /* Read a 6 bit len encoding type */
4041 if (isencoded) *isencoded = 1;
4042 return buf[0]&0x3F;
4043 } else if (type == REDIS_RDB_14BITLEN) {
4044 /* Read a 14 bit len */
4045 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4046 return ((buf[0]&0x3F)<<8)|buf[1];
4047 } else {
4048 /* Read a 32 bit len */
f78fd11b 4049 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4050 return ntohl(len);
f78fd11b 4051 }
f78fd11b 4052}
4053
ad30aa60 4054/* Load an integer-encoded object from file 'fp', with the specified
4055 * encoding type 'enctype'. If encode is true the function may return
4056 * an integer-encoded object as reply, otherwise the returned object
4057 * will always be encoded as a raw string. */
4058static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 4059 unsigned char enc[4];
4060 long long val;
4061
4062 if (enctype == REDIS_RDB_ENC_INT8) {
4063 if (fread(enc,1,1,fp) == 0) return NULL;
4064 val = (signed char)enc[0];
4065 } else if (enctype == REDIS_RDB_ENC_INT16) {
4066 uint16_t v;
4067 if (fread(enc,2,1,fp) == 0) return NULL;
4068 v = enc[0]|(enc[1]<<8);
4069 val = (int16_t)v;
4070 } else if (enctype == REDIS_RDB_ENC_INT32) {
4071 uint32_t v;
4072 if (fread(enc,4,1,fp) == 0) return NULL;
4073 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4074 val = (int32_t)v;
4075 } else {
4076 val = 0; /* anti-warning */
f83c6cb5 4077 redisPanic("Unknown RDB integer encoding type");
e3566d4b 4078 }
ad30aa60 4079 if (encode)
4080 return createStringObjectFromLongLong(val);
4081 else
4082 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 4083}
4084
c78a8ccc 4085static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 4086 unsigned int len, clen;
4087 unsigned char *c = NULL;
4088 sds val = NULL;
4089
c78a8ccc 4090 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4091 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 4092 if ((c = zmalloc(clen)) == NULL) goto err;
4093 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4094 if (fread(c,clen,1,fp) == 0) goto err;
4095 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 4096 zfree(c);
88e85998 4097 return createObject(REDIS_STRING,val);
4098err:
4099 zfree(c);
4100 sdsfree(val);
4101 return NULL;
4102}
4103
ad30aa60 4104static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 4105 int isencoded;
4106 uint32_t len;
f78fd11b 4107 sds val;
4108
c78a8ccc 4109 len = rdbLoadLen(fp,&isencoded);
e3566d4b 4110 if (isencoded) {
4111 switch(len) {
4112 case REDIS_RDB_ENC_INT8:
4113 case REDIS_RDB_ENC_INT16:
4114 case REDIS_RDB_ENC_INT32:
ad30aa60 4115 return rdbLoadIntegerObject(fp,len,encode);
88e85998 4116 case REDIS_RDB_ENC_LZF:
bdcb92f2 4117 return rdbLoadLzfStringObject(fp);
e3566d4b 4118 default:
f83c6cb5 4119 redisPanic("Unknown RDB encoding type");
e3566d4b 4120 }
4121 }
4122
f78fd11b 4123 if (len == REDIS_RDB_LENERR) return NULL;
4124 val = sdsnewlen(NULL,len);
4125 if (len && fread(val,len,1,fp) == 0) {
4126 sdsfree(val);
4127 return NULL;
4128 }
bdcb92f2 4129 return createObject(REDIS_STRING,val);
f78fd11b 4130}
4131
ad30aa60 4132static robj *rdbLoadStringObject(FILE *fp) {
4133 return rdbGenericLoadStringObject(fp,0);
4134}
4135
4136static robj *rdbLoadEncodedStringObject(FILE *fp) {
4137 return rdbGenericLoadStringObject(fp,1);
4138}
4139
a7866db6 4140/* For information about double serialization check rdbSaveDoubleValue() */
4141static int rdbLoadDoubleValue(FILE *fp, double *val) {
4142 char buf[128];
4143 unsigned char len;
4144
4145 if (fread(&len,1,1,fp) == 0) return -1;
4146 switch(len) {
4147 case 255: *val = R_NegInf; return 0;
4148 case 254: *val = R_PosInf; return 0;
4149 case 253: *val = R_Nan; return 0;
4150 default:
4151 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4152 buf[len] = '\0';
a7866db6 4153 sscanf(buf, "%lg", val);
4154 return 0;
4155 }
4156}
4157
c78a8ccc 4158/* Load a Redis object of the specified type from the specified file.
4159 * On success a newly allocated object is returned, otherwise NULL. */
4160static robj *rdbLoadObject(int type, FILE *fp) {
23f96494
PN
4161 robj *o, *ele, *dec;
4162 size_t len;
c78a8ccc 4163
bcd11906 4164 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4165 if (type == REDIS_STRING) {
4166 /* Read string value */
ad30aa60 4167 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4168 o = tryObjectEncoding(o);
23f96494
PN
4169 } else if (type == REDIS_LIST) {
4170 /* Read list value */
4171 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4172
d0686e07
PN
4173 /* Use a real list when there are too many entries */
4174 if (len > server.list_max_ziplist_entries) {
4175 o = createListObject();
4176 } else {
4177 o = createZiplistObject();
4178 }
c78a8ccc 4179
23f96494
PN
4180 /* Load every single element of the list */
4181 while(len--) {
4182 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4183
d0686e07
PN
4184 /* If we are using a ziplist and the value is too big, convert
4185 * the object to a real list. */
4186 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4187 ele->encoding == REDIS_ENCODING_RAW &&
4188 sdslen(ele->ptr) > server.list_max_ziplist_value)
003f0840 4189 listTypeConvert(o,REDIS_ENCODING_LIST);
d0686e07 4190
23f96494
PN
4191 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4192 dec = getDecodedObject(ele);
4193 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4194 decrRefCount(dec);
4195 decrRefCount(ele);
4196 } else {
4197 ele = tryObjectEncoding(ele);
4198 listAddNodeTail(o->ptr,ele);
23f96494
PN
4199 }
4200 }
4201 } else if (type == REDIS_SET) {
4202 /* Read list/set value */
4203 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4204 o = createSetObject();
3c68de9b 4205 /* It's faster to expand the dict to the right size asap in order
4206 * to avoid rehashing */
23f96494
PN
4207 if (len > DICT_HT_INITIAL_SIZE)
4208 dictExpand(o->ptr,len);
c78a8ccc 4209 /* Load every single element of the list/set */
23f96494 4210 while(len--) {
ad30aa60 4211 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4212 ele = tryObjectEncoding(ele);
23f96494 4213 dictAdd((dict*)o->ptr,ele,NULL);
c78a8ccc 4214 }
4215 } else if (type == REDIS_ZSET) {
4216 /* Read list/set value */
ada386b2 4217 size_t zsetlen;
c78a8ccc 4218 zset *zs;
4219
4220 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4221 o = createZsetObject();
4222 zs = o->ptr;
4223 /* Load every single element of the list/set */
4224 while(zsetlen--) {
4225 robj *ele;
4226 double *score = zmalloc(sizeof(double));
4227
ad30aa60 4228 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4229 ele = tryObjectEncoding(ele);
c78a8ccc 4230 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4231 dictAdd(zs->dict,ele,score);
4232 zslInsert(zs->zsl,*score,ele);
4233 incrRefCount(ele); /* added to skiplist */
4234 }
ada386b2 4235 } else if (type == REDIS_HASH) {
4236 size_t hashlen;
4237
4238 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4239 o = createHashObject();
4240 /* Too many entries? Use an hash table. */
4241 if (hashlen > server.hash_max_zipmap_entries)
4242 convertToRealHash(o);
4243 /* Load every key/value, then set it into the zipmap or hash
4244 * table, as needed. */
4245 while(hashlen--) {
4246 robj *key, *val;
4247
b785b2bf 4248 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4249 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
ada386b2 4250 /* If we are using a zipmap and there are too big values
4251 * the object is converted to real hash table encoding. */
4252 if (o->encoding != REDIS_ENCODING_HT &&
4253 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4254 sdslen(val->ptr) > server.hash_max_zipmap_value))
4255 {
4256 convertToRealHash(o);
4257 }
4258
4259 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4260 unsigned char *zm = o->ptr;
4261
4262 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4263 val->ptr,sdslen(val->ptr),NULL);
4264 o->ptr = zm;
4265 decrRefCount(key);
4266 decrRefCount(val);
4267 } else {
05df7621 4268 key = tryObjectEncoding(key);
4269 val = tryObjectEncoding(val);
ada386b2 4270 dictAdd((dict*)o->ptr,key,val);
ada386b2 4271 }
4272 }
c78a8ccc 4273 } else {
f83c6cb5 4274 redisPanic("Unknown object type");
c78a8ccc 4275 }
4276 return o;
4277}
4278
f78fd11b 4279static int rdbLoad(char *filename) {
ed9b544e 4280 FILE *fp;
f78fd11b 4281 uint32_t dbid;
bb32ede5 4282 int type, retval, rdbver;
585af7e2 4283 int swap_all_values = 0;
bb32ede5 4284 redisDb *db = server.db+0;
f78fd11b 4285 char buf[1024];
242a64f3 4286 time_t expiretime, now = time(NULL);
bb32ede5 4287
ed9b544e 4288 fp = fopen(filename,"r");
4289 if (!fp) return REDIS_ERR;
4290 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4291 buf[9] = '\0';
4292 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4293 fclose(fp);
4294 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4295 return REDIS_ERR;
4296 }
f78fd11b 4297 rdbver = atoi(buf+5);
c78a8ccc 4298 if (rdbver != 1) {
f78fd11b 4299 fclose(fp);
4300 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4301 return REDIS_ERR;
4302 }
ed9b544e 4303 while(1) {
585af7e2 4304 robj *key, *val;
7e02fe32 4305 int force_swapout;
ed9b544e 4306
585af7e2 4307 expiretime = -1;
ed9b544e 4308 /* Read type. */
f78fd11b 4309 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4310 if (type == REDIS_EXPIRETIME) {
4311 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4312 /* We read the time so we need to read the object type again */
4313 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4314 }
ed9b544e 4315 if (type == REDIS_EOF) break;
4316 /* Handle SELECT DB opcode as a special case */
4317 if (type == REDIS_SELECTDB) {
c78a8ccc 4318 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4319 goto eoferr;
ed9b544e 4320 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4321 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4322 exit(1);
4323 }
bb32ede5 4324 db = server.db+dbid;
ed9b544e 4325 continue;
4326 }
4327 /* Read key */
585af7e2 4328 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4329 /* Read value */
585af7e2 4330 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4331 /* Check if the key already expired */
4332 if (expiretime != -1 && expiretime < now) {
4333 decrRefCount(key);
4334 decrRefCount(val);
4335 continue;
4336 }
ed9b544e 4337 /* Add the new object in the hash table */
09241813 4338 retval = dbAdd(db,key,val);
4339 if (retval == REDIS_ERR) {
585af7e2 4340 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4341 exit(1);
4342 }
bb32ede5 4343 /* Set the expire time if needed */
89e689c5 4344 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4345
b492cf00 4346 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4347
4348 /* If we detecter we are hopeless about fitting something in memory
4349 * we just swap every new key on disk. Directly...
4350 * Note that's important to check for this condition before resorting
4351 * to random sampling, otherwise we may try to swap already
4352 * swapped keys. */
585af7e2 4353 if (swap_all_values) {
09241813 4354 dictEntry *de = dictFind(db->dict,key->ptr);
242a64f3 4355
4356 /* de may be NULL since the key already expired */
4357 if (de) {
560db612 4358 vmpointer *vp;
585af7e2 4359 val = dictGetEntryVal(de);
242a64f3 4360
560db612 4361 if (val->refcount == 1 &&
4362 (vp = vmSwapObjectBlocking(val)) != NULL)
4363 dictGetEntryVal(de) = vp;
242a64f3 4364 }
09241813 4365 decrRefCount(key);
242a64f3 4366 continue;
4367 }
09241813 4368 decrRefCount(key);
242a64f3 4369
a89b7013 4370 /* Flush data on disk once 32 MB of additional RAM are used... */
7e02fe32 4371 force_swapout = 0;
4372 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4373 force_swapout = 1;
242a64f3 4374
4375 /* If we have still some hope of having some value fitting memory
4376 * then we try random sampling. */
7e02fe32 4377 if (!swap_all_values && server.vm_enabled && force_swapout) {
b492cf00 4378 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4379 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4380 }
242a64f3 4381 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4382 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4383 }
ed9b544e 4384 }
4385 fclose(fp);
4386 return REDIS_OK;
4387
4388eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4389 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4390 exit(1);
4391 return REDIS_ERR; /* Just to avoid warning */
4392}
4393
b58ba105 4394/*================================== Shutdown =============================== */
fab43727 4395static int prepareForShutdown() {
b58ba105
AM
4396 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4397 /* Kill the saving child if there is a background saving in progress.
4398 We want to avoid race conditions, for instance our saving child may
4399 overwrite the synchronous saving did by SHUTDOWN. */
4400 if (server.bgsavechildpid != -1) {
4401 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4402 kill(server.bgsavechildpid,SIGKILL);
4403 rdbRemoveTempFile(server.bgsavechildpid);
4404 }
4405 if (server.appendonly) {
4406 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4407 aof_fsync(server.appendfd);
b58ba105 4408 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4409 } else {
4410 /* Snapshotting. Perform a SYNC SAVE and exit */
4411 if (rdbSave(server.dbfilename) == REDIS_OK) {
4412 if (server.daemonize)
4413 unlink(server.pidfile);
4414 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4415 } else {
4416 /* Ooops.. error saving! The best we can do is to continue
4417 * operating. Note that if there was a background saving process,
4418 * in the next cron() Redis will be notified that the background
4419 * saving aborted, handling special stuff like slaves pending for
4420 * synchronization... */
4421 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4422 return REDIS_ERR;
b58ba105
AM
4423 }
4424 }
8513a757 4425 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4426 return REDIS_OK;
b58ba105
AM
4427}
4428
ed9b544e 4429/*================================== Commands =============================== */
4430
abcb223e 4431static void authCommand(redisClient *c) {
2e77c2ee 4432 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4433 c->authenticated = 1;
4434 addReply(c,shared.ok);
4435 } else {
4436 c->authenticated = 0;
fa4c0aba 4437 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4438 }
4439}
4440
ed9b544e 4441static void pingCommand(redisClient *c) {
4442 addReply(c,shared.pong);
4443}
4444
4445static void echoCommand(redisClient *c) {
dd88747b 4446 addReplyBulk(c,c->argv[1]);
ed9b544e 4447}
4448
4449/*=================================== Strings =============================== */
4450
526d00a5 4451static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4452 int retval;
10ce1276 4453 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4454
526d00a5 4455 if (expire) {
4456 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4457 return;
4458 if (seconds <= 0) {
4459 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4460 return;
4461 }
4462 }
4463
37ab76c9 4464 touchWatchedKey(c->db,key);
526d00a5 4465 if (nx) deleteIfVolatile(c->db,key);
09241813 4466 retval = dbAdd(c->db,key,val);
4467 if (retval == REDIS_ERR) {
ed9b544e 4468 if (!nx) {
09241813 4469 dbReplace(c->db,key,val);
526d00a5 4470 incrRefCount(val);
ed9b544e 4471 } else {
c937aa89 4472 addReply(c,shared.czero);
ed9b544e 4473 return;
4474 }
4475 } else {
526d00a5 4476 incrRefCount(val);
ed9b544e 4477 }
4478 server.dirty++;
526d00a5 4479 removeExpire(c->db,key);
4480 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4481 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4482}
4483
4484static void setCommand(redisClient *c) {
526d00a5 4485 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4486}
4487
4488static void setnxCommand(redisClient *c) {
526d00a5 4489 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4490}
4491
4492static void setexCommand(redisClient *c) {
4493 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4494}
4495
322fc7d8 4496static int getGenericCommand(redisClient *c) {
dd88747b 4497 robj *o;
e0a62c7f 4498
dd88747b 4499 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4500 return REDIS_OK;
dd88747b 4501
4502 if (o->type != REDIS_STRING) {
4503 addReply(c,shared.wrongtypeerr);
4504 return REDIS_ERR;
ed9b544e 4505 } else {
dd88747b 4506 addReplyBulk(c,o);
4507 return REDIS_OK;
ed9b544e 4508 }
4509}
4510
322fc7d8 4511static void getCommand(redisClient *c) {
4512 getGenericCommand(c);
4513}
4514
f6b141c5 4515static void getsetCommand(redisClient *c) {
322fc7d8 4516 if (getGenericCommand(c) == REDIS_ERR) return;
09241813 4517 dbReplace(c->db,c->argv[1],c->argv[2]);
a431eb74 4518 incrRefCount(c->argv[2]);
4519 server.dirty++;
4520 removeExpire(c->db,c->argv[1]);
4521}
4522
70003d28 4523static void mgetCommand(redisClient *c) {
70003d28 4524 int j;
e0a62c7f 4525
c937aa89 4526 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4527 for (j = 1; j < c->argc; j++) {
3305306f 4528 robj *o = lookupKeyRead(c->db,c->argv[j]);
4529 if (o == NULL) {
c937aa89 4530 addReply(c,shared.nullbulk);
70003d28 4531 } else {
70003d28 4532 if (o->type != REDIS_STRING) {
c937aa89 4533 addReply(c,shared.nullbulk);
70003d28 4534 } else {
dd88747b 4535 addReplyBulk(c,o);
70003d28 4536 }
4537 }
4538 }
4539}
4540
6c446631 4541static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4542 int j, busykeys = 0;
6c446631 4543
4544 if ((c->argc % 2) == 0) {
454d4e43 4545 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4546 return;
4547 }
4548 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4549 * set nothing at all if at least one already key exists. */
4550 if (nx) {
4551 for (j = 1; j < c->argc; j += 2) {
906573e7 4552 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4553 busykeys++;
6c446631 4554 }
4555 }
4556 }
906573e7 4557 if (busykeys) {
4558 addReply(c, shared.czero);
4559 return;
4560 }
6c446631 4561
4562 for (j = 1; j < c->argc; j += 2) {
05df7621 4563 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
09241813 4564 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4565 incrRefCount(c->argv[j+1]);
6c446631 4566 removeExpire(c->db,c->argv[j]);
4567 }
4568 server.dirty += (c->argc-1)/2;
4569 addReply(c, nx ? shared.cone : shared.ok);
4570}
4571
4572static void msetCommand(redisClient *c) {
4573 msetGenericCommand(c,0);
4574}
4575
4576static void msetnxCommand(redisClient *c) {
4577 msetGenericCommand(c,1);
4578}
4579
d68ed120 4580static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4581 long long value;
ed9b544e 4582 robj *o;
e0a62c7f 4583
3305306f 4584 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4585 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4586 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4587
4588 value += incr;
d6f4c262 4589 o = createStringObjectFromLongLong(value);
09241813 4590 dbReplace(c->db,c->argv[1],o);
ed9b544e 4591 server.dirty++;
c937aa89 4592 addReply(c,shared.colon);
ed9b544e 4593 addReply(c,o);
4594 addReply(c,shared.crlf);
4595}
4596
4597static void incrCommand(redisClient *c) {
a4d1ba9a 4598 incrDecrCommand(c,1);
ed9b544e 4599}
4600
4601static void decrCommand(redisClient *c) {
a4d1ba9a 4602 incrDecrCommand(c,-1);
ed9b544e 4603}
4604
4605static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4606 long long incr;
4607
bd79a6bd 4608 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4609 incrDecrCommand(c,incr);
ed9b544e 4610}
4611
4612static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4613 long long incr;
4614
bd79a6bd 4615 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4616 incrDecrCommand(c,-incr);
ed9b544e 4617}
4618
4b00bebd 4619static void appendCommand(redisClient *c) {
4620 int retval;
4621 size_t totlen;
4622 robj *o;
4623
4624 o = lookupKeyWrite(c->db,c->argv[1]);
4625 if (o == NULL) {
4626 /* Create the key */
09241813 4627 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4b00bebd 4628 incrRefCount(c->argv[2]);
4629 totlen = stringObjectLen(c->argv[2]);
4630 } else {
4b00bebd 4631 if (o->type != REDIS_STRING) {
4632 addReply(c,shared.wrongtypeerr);
4633 return;
4634 }
4635 /* If the object is specially encoded or shared we have to make
4636 * a copy */
4637 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4638 robj *decoded = getDecodedObject(o);
4639
4640 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4641 decrRefCount(decoded);
09241813 4642 dbReplace(c->db,c->argv[1],o);
4b00bebd 4643 }
4644 /* APPEND! */
4645 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4646 o->ptr = sdscatlen(o->ptr,
4647 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4648 } else {
4649 o->ptr = sdscatprintf(o->ptr, "%ld",
4650 (unsigned long) c->argv[2]->ptr);
4651 }
4652 totlen = sdslen(o->ptr);
4653 }
4654 server.dirty++;
4655 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4656}
4657
39191553 4658static void substrCommand(redisClient *c) {
4659 robj *o;
4660 long start = atoi(c->argv[2]->ptr);
4661 long end = atoi(c->argv[3]->ptr);
dd88747b 4662 size_t rangelen, strlen;
4663 sds range;
39191553 4664
dd88747b 4665 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4666 checkType(c,o,REDIS_STRING)) return;
39191553 4667
dd88747b 4668 o = getDecodedObject(o);
4669 strlen = sdslen(o->ptr);
8fe7fad7 4670
dd88747b 4671 /* convert negative indexes */
4672 if (start < 0) start = strlen+start;
4673 if (end < 0) end = strlen+end;
4674 if (start < 0) start = 0;
4675 if (end < 0) end = 0;
39191553 4676
dd88747b 4677 /* indexes sanity checks */
4678 if (start > end || (size_t)start >= strlen) {
4679 /* Out of range start or start > end result in null reply */
4680 addReply(c,shared.nullbulk);
4681 decrRefCount(o);
4682 return;
39191553 4683 }
dd88747b 4684 if ((size_t)end >= strlen) end = strlen-1;
4685 rangelen = (end-start)+1;
4686
4687 /* Return the result */
4688 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4689 range = sdsnewlen((char*)o->ptr+start,rangelen);
4690 addReplySds(c,range);
4691 addReply(c,shared.crlf);
4692 decrRefCount(o);
39191553 4693}
4694
ed9b544e 4695/* ========================= Type agnostic commands ========================= */
4696
4697static void delCommand(redisClient *c) {
5109cdff 4698 int deleted = 0, j;
4699
4700 for (j = 1; j < c->argc; j++) {
09241813 4701 if (dbDelete(c->db,c->argv[j])) {
37ab76c9 4702 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4703 server.dirty++;
4704 deleted++;
4705 }
4706 }
482b672d 4707 addReplyLongLong(c,deleted);
ed9b544e 4708}
4709
4710static void existsCommand(redisClient *c) {
f4f06efc 4711 expireIfNeeded(c->db,c->argv[1]);
09241813 4712 if (dbExists(c->db,c->argv[1])) {
f4f06efc
PN
4713 addReply(c, shared.cone);
4714 } else {
4715 addReply(c, shared.czero);
4716 }
ed9b544e 4717}
4718
4719static void selectCommand(redisClient *c) {
4720 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4721
ed9b544e 4722 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4723 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4724 } else {
4725 addReply(c,shared.ok);
4726 }
4727}
4728
4729static void randomkeyCommand(redisClient *c) {
dc4be23e 4730 robj *key;
e0a62c7f 4731
09241813 4732 if ((key = dbRandomKey(c->db)) == NULL) {
dc4be23e 4733 addReply(c,shared.nullbulk);
4734 return;
4735 }
4736
09241813 4737 addReplyBulk(c,key);
4738 decrRefCount(key);
ed9b544e 4739}
4740
4741static void keysCommand(redisClient *c) {
4742 dictIterator *di;
4743 dictEntry *de;
4744 sds pattern = c->argv[1]->ptr;
4745 int plen = sdslen(pattern);
a3f9eec2 4746 unsigned long numkeys = 0;
ed9b544e 4747 robj *lenobj = createObject(REDIS_STRING,NULL);
4748
3305306f 4749 di = dictGetIterator(c->db->dict);
ed9b544e 4750 addReply(c,lenobj);
4751 decrRefCount(lenobj);
4752 while((de = dictNext(di)) != NULL) {
09241813 4753 sds key = dictGetEntryKey(de);
4754 robj *keyobj;
3305306f 4755
ed9b544e 4756 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4757 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
09241813 4758 keyobj = createStringObject(key,sdslen(key));
3305306f 4759 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4760 addReplyBulk(c,keyobj);
3305306f 4761 numkeys++;
3305306f 4762 }
09241813 4763 decrRefCount(keyobj);
ed9b544e 4764 }
4765 }
4766 dictReleaseIterator(di);
a3f9eec2 4767 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4768}
4769
4770static void dbsizeCommand(redisClient *c) {
4771 addReplySds(c,
3305306f 4772 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4773}
4774
4775static void lastsaveCommand(redisClient *c) {
4776 addReplySds(c,
c937aa89 4777 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4778}
4779
4780static void typeCommand(redisClient *c) {
3305306f 4781 robj *o;
ed9b544e 4782 char *type;
3305306f 4783
4784 o = lookupKeyRead(c->db,c->argv[1]);
4785 if (o == NULL) {
c937aa89 4786 type = "+none";
ed9b544e 4787 } else {
ed9b544e 4788 switch(o->type) {
c937aa89 4789 case REDIS_STRING: type = "+string"; break;
4790 case REDIS_LIST: type = "+list"; break;
4791 case REDIS_SET: type = "+set"; break;
412a8bce 4792 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4793 case REDIS_HASH: type = "+hash"; break;
4794 default: type = "+unknown"; break;
ed9b544e 4795 }
4796 }
4797 addReplySds(c,sdsnew(type));
4798 addReply(c,shared.crlf);
4799}
4800
4801static void saveCommand(redisClient *c) {
9d65a1bb 4802 if (server.bgsavechildpid != -1) {
05557f6d 4803 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4804 return;
4805 }
f78fd11b 4806 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4807 addReply(c,shared.ok);
4808 } else {
4809 addReply(c,shared.err);
4810 }
4811}
4812
4813static void bgsaveCommand(redisClient *c) {
9d65a1bb 4814 if (server.bgsavechildpid != -1) {
ed9b544e 4815 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4816 return;
4817 }
f78fd11b 4818 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4819 char *status = "+Background saving started\r\n";
4820 addReplySds(c,sdsnew(status));
ed9b544e 4821 } else {
4822 addReply(c,shared.err);
4823 }
4824}
4825
4826static void shutdownCommand(redisClient *c) {
fab43727 4827 if (prepareForShutdown() == REDIS_OK)
4828 exit(0);
4829 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4830}
4831
4832static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4833 robj *o;
4834
4835 /* To use the same key as src and dst is probably an error */
4836 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4837 addReply(c,shared.sameobjecterr);
ed9b544e 4838 return;
4839 }
4840
dd88747b 4841 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4842 return;
dd88747b 4843
ed9b544e 4844 incrRefCount(o);
3305306f 4845 deleteIfVolatile(c->db,c->argv[2]);
09241813 4846 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
ed9b544e 4847 if (nx) {
4848 decrRefCount(o);
c937aa89 4849 addReply(c,shared.czero);
ed9b544e 4850 return;
4851 }
09241813 4852 dbReplace(c->db,c->argv[2],o);
ed9b544e 4853 }
09241813 4854 dbDelete(c->db,c->argv[1]);
b167f877 4855 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4856 server.dirty++;
c937aa89 4857 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4858}
4859
4860static void renameCommand(redisClient *c) {
4861 renameGenericCommand(c,0);
4862}
4863
4864static void renamenxCommand(redisClient *c) {
4865 renameGenericCommand(c,1);
4866}
4867
4868static void moveCommand(redisClient *c) {
3305306f 4869 robj *o;
4870 redisDb *src, *dst;
ed9b544e 4871 int srcid;
4872
4873 /* Obtain source and target DB pointers */
3305306f 4874 src = c->db;
4875 srcid = c->db->id;
ed9b544e 4876 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4877 addReply(c,shared.outofrangeerr);
ed9b544e 4878 return;
4879 }
3305306f 4880 dst = c->db;
4881 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4882
4883 /* If the user is moving using as target the same
4884 * DB as the source DB it is probably an error. */
4885 if (src == dst) {
c937aa89 4886 addReply(c,shared.sameobjecterr);
ed9b544e 4887 return;
4888 }
4889
4890 /* Check if the element exists and get a reference */
3305306f 4891 o = lookupKeyWrite(c->db,c->argv[1]);
4892 if (!o) {
c937aa89 4893 addReply(c,shared.czero);
ed9b544e 4894 return;
4895 }
4896
4897 /* Try to add the element to the target DB */
3305306f 4898 deleteIfVolatile(dst,c->argv[1]);
09241813 4899 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
c937aa89 4900 addReply(c,shared.czero);
ed9b544e 4901 return;
4902 }
ed9b544e 4903 incrRefCount(o);
4904
4905 /* OK! key moved, free the entry in the source DB */
09241813 4906 dbDelete(src,c->argv[1]);
ed9b544e 4907 server.dirty++;
c937aa89 4908 addReply(c,shared.cone);
ed9b544e 4909}
4910
4911/* =================================== Lists ================================ */
d0686e07
PN
4912
4913
4914/* Check the argument length to see if it requires us to convert the ziplist
4915 * to a real list. Only check raw-encoded objects because integer encoded
4916 * objects are never too long. */
003f0840 4917static void listTypeTryConversion(robj *subject, robj *value) {
d0686e07
PN
4918 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4919 if (value->encoding == REDIS_ENCODING_RAW &&
4920 sdslen(value->ptr) > server.list_max_ziplist_value)
003f0840 4921 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07
PN
4922}
4923
003f0840 4924static void listTypePush(robj *subject, robj *value, int where) {
d0686e07 4925 /* Check if we need to convert the ziplist */
003f0840 4926 listTypeTryConversion(subject,value);
d0686e07 4927 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
70b4b320 4928 ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
003f0840 4929 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07 4930
c7d9d662
PN
4931 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4932 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4933 value = getDecodedObject(value);
4934 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4935 decrRefCount(value);
4936 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4937 if (where == REDIS_HEAD) {
4938 listAddNodeHead(subject->ptr,value);
4939 } else {
4940 listAddNodeTail(subject->ptr,value);
4941 }
4942 incrRefCount(value);
4943 } else {
4944 redisPanic("Unknown list encoding");
4945 }
4946}
4947
003f0840 4948static robj *listTypePop(robj *subject, int where) {
d72562f7
PN
4949 robj *value = NULL;
4950 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4951 unsigned char *p;
b6eb9703 4952 unsigned char *vstr;
d72562f7 4953 unsigned int vlen;
b6eb9703 4954 long long vlong;
d72562f7
PN
4955 int pos = (where == REDIS_HEAD) ? 0 : -1;
4956 p = ziplistIndex(subject->ptr,pos);
b6eb9703
PN
4957 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4958 if (vstr) {
4959 value = createStringObject((char*)vstr,vlen);
d72562f7 4960 } else {
b6eb9703 4961 value = createStringObjectFromLongLong(vlong);
d72562f7 4962 }
0f62e177
PN
4963 /* We only need to delete an element when it exists */
4964 subject->ptr = ziplistDelete(subject->ptr,&p);
d72562f7 4965 }
d72562f7
PN
4966 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4967 list *list = subject->ptr;
4968 listNode *ln;
4969 if (where == REDIS_HEAD) {
4970 ln = listFirst(list);
4971 } else {
4972 ln = listLast(list);
4973 }
4974 if (ln != NULL) {
4975 value = listNodeValue(ln);
4976 incrRefCount(value);
4977 listDelNode(list,ln);
4978 }
4979 } else {
4980 redisPanic("Unknown list encoding");
4981 }
4982 return value;
4983}
4984
003f0840 4985static unsigned long listTypeLength(robj *subject) {
d72562f7
PN
4986 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4987 return ziplistLen(subject->ptr);
4988 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4989 return listLength((list*)subject->ptr);
4990 } else {
4991 redisPanic("Unknown list encoding");
4992 }
4993}
4994
a6dd455b
PN
4995/* Structure to hold set iteration abstraction. */
4996typedef struct {
4997 robj *subject;
4998 unsigned char encoding;
be02a7c0 4999 unsigned char direction; /* Iteration direction */
a6dd455b
PN
5000 unsigned char *zi;
5001 listNode *ln;
003f0840 5002} listTypeIterator;
a6dd455b 5003
be02a7c0
PN
5004/* Structure for an entry while iterating over a list. */
5005typedef struct {
003f0840 5006 listTypeIterator *li;
be02a7c0
PN
5007 unsigned char *zi; /* Entry in ziplist */
5008 listNode *ln; /* Entry in linked list */
003f0840 5009} listTypeEntry;
be02a7c0 5010
a6dd455b 5011/* Initialize an iterator at the specified index. */
003f0840
PN
5012static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5013 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
a6dd455b
PN
5014 li->subject = subject;
5015 li->encoding = subject->encoding;
be02a7c0 5016 li->direction = direction;
a6dd455b
PN
5017 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5018 li->zi = ziplistIndex(subject->ptr,index);
5019 } else if (li->encoding == REDIS_ENCODING_LIST) {
5020 li->ln = listIndex(subject->ptr,index);
5021 } else {
5022 redisPanic("Unknown list encoding");
5023 }
5024 return li;
5025}
5026
5027/* Clean up the iterator. */
003f0840 5028static void listTypeReleaseIterator(listTypeIterator *li) {
a6dd455b
PN
5029 zfree(li);
5030}
5031
be02a7c0
PN
5032/* Stores pointer to current the entry in the provided entry structure
5033 * and advances the position of the iterator. Returns 1 when the current
5034 * entry is in fact an entry, 0 otherwise. */
003f0840 5035static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
dda20542
PN
5036 /* Protect from converting when iterating */
5037 redisAssert(li->subject->encoding == li->encoding);
5038
be02a7c0 5039 entry->li = li;
d2ee16ab 5040 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5041 entry->zi = li->zi;
5042 if (entry->zi != NULL) {
5043 if (li->direction == REDIS_TAIL)
5044 li->zi = ziplistNext(li->subject->ptr,li->zi);
5045 else
5046 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5047 return 1;
5048 }
d2ee16ab 5049 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5050 entry->ln = li->ln;
5051 if (entry->ln != NULL) {
5052 if (li->direction == REDIS_TAIL)
5053 li->ln = li->ln->next;
5054 else
5055 li->ln = li->ln->prev;
5056 return 1;
5057 }
d2ee16ab
PN
5058 } else {
5059 redisPanic("Unknown list encoding");
5060 }
be02a7c0 5061 return 0;
d2ee16ab
PN
5062}
5063
a6dd455b 5064/* Return entry or NULL at the current position of the iterator. */
003f0840
PN
5065static robj *listTypeGet(listTypeEntry *entry) {
5066 listTypeIterator *li = entry->li;
a6dd455b
PN
5067 robj *value = NULL;
5068 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
b6eb9703 5069 unsigned char *vstr;
a6dd455b 5070 unsigned int vlen;
b6eb9703 5071 long long vlong;
be02a7c0 5072 redisAssert(entry->zi != NULL);
b6eb9703
PN
5073 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5074 if (vstr) {
5075 value = createStringObject((char*)vstr,vlen);
a6dd455b 5076 } else {
b6eb9703 5077 value = createStringObjectFromLongLong(vlong);
a6dd455b
PN
5078 }
5079 }
5080 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5081 redisAssert(entry->ln != NULL);
5082 value = listNodeValue(entry->ln);
a6dd455b
PN
5083 incrRefCount(value);
5084 } else {
5085 redisPanic("Unknown list encoding");
5086 }
5087 return value;
5088}
5089
bcfb3876
PN
5090static void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
5091 robj *subject = entry->li->subject;
5092 if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
244b873b 5093 value = getDecodedObject(value);
279d7e67 5094 if (where == REDIS_TAIL) {
bcfb3876
PN
5095 unsigned char *next = ziplistNext(subject->ptr,entry->zi);
5096
5097 /* When we insert after the current element, but the current element
5098 * is the tail of the list, we need to do a push. */
0e1684bc 5099 if (next == NULL) {
bcfb3876 5100 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL);
0e1684bc 5101 } else {
bcfb3876 5102 subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
0e1684bc
PN
5103 }
5104 } else {
bcfb3876 5105 subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
0e1684bc 5106 }
244b873b 5107 decrRefCount(value);
bcfb3876 5108 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
279d7e67 5109 if (where == REDIS_TAIL) {
bcfb3876 5110 listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
0e1684bc 5111 } else {
bcfb3876 5112 listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
0e1684bc 5113 }
bcfb3876 5114 incrRefCount(value);
0e1684bc
PN
5115 } else {
5116 redisPanic("Unknown list encoding");
5117 }
5118}
5119
d2ee16ab 5120/* Compare the given object with the entry at the current position. */
003f0840
PN
5121static int listTypeEqual(listTypeEntry *entry, robj *o) {
5122 listTypeIterator *li = entry->li;
d2ee16ab
PN
5123 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5124 redisAssert(o->encoding == REDIS_ENCODING_RAW);
be02a7c0 5125 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
d2ee16ab 5126 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0 5127 return equalStringObjects(o,listNodeValue(entry->ln));
d2ee16ab
PN
5128 } else {
5129 redisPanic("Unknown list encoding");
5130 }
5131}
5132
be02a7c0 5133/* Delete the element pointed to. */
003f0840
PN
5134static void listTypeDelete(listTypeEntry *entry) {
5135 listTypeIterator *li = entry->li;
a6dd455b 5136 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5137 unsigned char *p = entry->zi;
5138 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5139
5140 /* Update position of the iterator depending on the direction */
5141 if (li->direction == REDIS_TAIL)
5142 li->zi = p;
a6dd455b 5143 else
be02a7c0
PN
5144 li->zi = ziplistPrev(li->subject->ptr,p);
5145 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5146 listNode *next;
5147 if (li->direction == REDIS_TAIL)
5148 next = entry->ln->next;
a6dd455b 5149 else
be02a7c0
PN
5150 next = entry->ln->prev;
5151 listDelNode(li->subject->ptr,entry->ln);
5152 li->ln = next;
a6dd455b
PN
5153 } else {
5154 redisPanic("Unknown list encoding");
5155 }
5156}
3305306f 5157
003f0840
PN
5158static void listTypeConvert(robj *subject, int enc) {
5159 listTypeIterator *li;
5160 listTypeEntry entry;
d0686e07
PN
5161 redisAssert(subject->type == REDIS_LIST);
5162
5163 if (enc == REDIS_ENCODING_LIST) {
5164 list *l = listCreate();
cd627d4e 5165 listSetFreeMethod(l,decrRefCount);
d0686e07 5166
003f0840
PN
5167 /* listTypeGet returns a robj with incremented refcount */
5168 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5169 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5170 listTypeReleaseIterator(li);
d0686e07
PN
5171
5172 subject->encoding = REDIS_ENCODING_LIST;
5173 zfree(subject->ptr);
5174 subject->ptr = l;
5175 } else {
5176 redisPanic("Unsupported list conversion");
5177 }
5178}
5179
c7d9d662
PN
5180static void pushGenericCommand(redisClient *c, int where) {
5181 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
3305306f 5182 if (lobj == NULL) {
95242ab5 5183 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5184 addReply(c,shared.cone);
95242ab5 5185 return;
5186 }
1cd92e7f 5187 lobj = createZiplistObject();
09241813 5188 dbAdd(c->db,c->argv[1],lobj);
ed9b544e 5189 } else {
ed9b544e 5190 if (lobj->type != REDIS_LIST) {
5191 addReply(c,shared.wrongtypeerr);
5192 return;
5193 }
95242ab5 5194 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5195 addReply(c,shared.cone);
95242ab5 5196 return;
5197 }
ed9b544e 5198 }
003f0840
PN
5199 listTypePush(lobj,c->argv[2],where);
5200 addReplyLongLong(c,listTypeLength(lobj));
ed9b544e 5201 server.dirty++;
ed9b544e 5202}
5203
5204static void lpushCommand(redisClient *c) {
5205 pushGenericCommand(c,REDIS_HEAD);
5206}
5207
5208static void rpushCommand(redisClient *c) {
5209 pushGenericCommand(c,REDIS_TAIL);
5210}
5211
bcfb3876 5212static void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
dedff272
RP
5213 robj *subject;
5214 listTypeIterator *iter;
5215 listTypeEntry entry;
70b4b320 5216 int inserted = 0;
dedff272
RP
5217
5218 if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5219 checkType(c,subject,REDIS_LIST)) return;
dedff272 5220
bcfb3876
PN
5221 if (refval != NULL) {
5222 /* Note: we expect refval to be string-encoded because it is *not* the
5223 * last argument of the multi-bulk LINSERT. */
5224 redisAssert(refval->encoding == REDIS_ENCODING_RAW);
5225
70b4b320
PN
5226 /* We're not sure if this value can be inserted yet, but we cannot
5227 * convert the list inside the iterator. We don't want to loop over
5228 * the list twice (once to see if the value can be inserted and once
5229 * to do the actual insert), so we assume this value can be inserted
5230 * and convert the ziplist to a regular list if necessary. */
5231 listTypeTryConversion(subject,val);
5232
bcfb3876 5233 /* Seek refval from head to tail */
1240552d 5234 iter = listTypeInitIterator(subject,0,REDIS_TAIL);
dedff272 5235 while (listTypeNext(iter,&entry)) {
bcfb3876
PN
5236 if (listTypeEqual(&entry,refval)) {
5237 listTypeInsert(&entry,val,where);
70b4b320 5238 inserted = 1;
dedff272
RP
5239 break;
5240 }
5241 }
5242 listTypeReleaseIterator(iter);
70b4b320
PN
5243
5244 if (inserted) {
5245 /* Check if the length exceeds the ziplist length threshold. */
5246 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
5247 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
5248 listTypeConvert(subject,REDIS_ENCODING_LIST);
5249 server.dirty++;
23d3a5fe
PN
5250 } else {
5251 /* Notify client of a failed insert */
5252 addReply(c,shared.cnegone);
5253 return;
70b4b320 5254 }
dedff272 5255 } else {
bcfb3876 5256 listTypePush(subject,val,where);
70b4b320 5257 server.dirty++;
dedff272
RP
5258 }
5259
dedff272
RP
5260 addReplyUlong(c,listTypeLength(subject));
5261}
5262
5263static void lpushxCommand(redisClient *c) {
bcfb3876 5264 pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
dedff272
RP
5265}
5266
5267static void rpushxCommand(redisClient *c) {
bcfb3876 5268 pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
dedff272
RP
5269}
5270
5271static void linsertCommand(redisClient *c) {
5272 if (strcasecmp(c->argv[2]->ptr,"after") == 0) {
bcfb3876 5273 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL);
279d7e67 5274 } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) {
bcfb3876 5275 pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD);
dedff272
RP
5276 } else {
5277 addReply(c,shared.syntaxerr);
5278 }
5279}
5280
ed9b544e 5281static void llenCommand(redisClient *c) {
d72562f7
PN
5282 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5283 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
003f0840 5284 addReplyUlong(c,listTypeLength(o));
ed9b544e 5285}
5286
5287static void lindexCommand(redisClient *c) {
697bd567
PN
5288 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5289 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5290 int index = atoi(c->argv[2]->ptr);
bd8db0ad 5291 robj *value = NULL;
dd88747b 5292
697bd567
PN
5293 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5294 unsigned char *p;
b6eb9703 5295 unsigned char *vstr;
697bd567 5296 unsigned int vlen;
b6eb9703 5297 long long vlong;
697bd567 5298 p = ziplistIndex(o->ptr,index);
b6eb9703
PN
5299 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5300 if (vstr) {
5301 value = createStringObject((char*)vstr,vlen);
697bd567 5302 } else {
b6eb9703 5303 value = createStringObjectFromLongLong(vlong);
697bd567 5304 }
bd8db0ad
PN
5305 addReplyBulk(c,value);
5306 decrRefCount(value);
697bd567
PN
5307 } else {
5308 addReply(c,shared.nullbulk);
5309 }
5310 } else if (o->encoding == REDIS_ENCODING_LIST) {
5311 listNode *ln = listIndex(o->ptr,index);
5312 if (ln != NULL) {
bd8db0ad
PN
5313 value = listNodeValue(ln);
5314 addReplyBulk(c,value);
697bd567
PN
5315 } else {
5316 addReply(c,shared.nullbulk);
5317 }
ed9b544e 5318 } else {
697bd567 5319 redisPanic("Unknown list encoding");
ed9b544e 5320 }
5321}
5322
5323static void lsetCommand(redisClient *c) {
697bd567
PN
5324 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5325 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5326 int index = atoi(c->argv[2]->ptr);
697bd567 5327 robj *value = c->argv[3];
dd88747b 5328
003f0840 5329 listTypeTryConversion(o,value);
697bd567
PN
5330 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5331 unsigned char *p, *zl = o->ptr;
5332 p = ziplistIndex(zl,index);
5333 if (p == NULL) {
5334 addReply(c,shared.outofrangeerr);
5335 } else {
be02a7c0 5336 o->ptr = ziplistDelete(o->ptr,&p);
697bd567
PN
5337 value = getDecodedObject(value);
5338 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5339 decrRefCount(value);
5340 addReply(c,shared.ok);
5341 server.dirty++;
5342 }
5343 } else if (o->encoding == REDIS_ENCODING_LIST) {
5344 listNode *ln = listIndex(o->ptr,index);
5345 if (ln == NULL) {
5346 addReply(c,shared.outofrangeerr);
5347 } else {
5348 decrRefCount((robj*)listNodeValue(ln));
5349 listNodeValue(ln) = value;
5350 incrRefCount(value);
5351 addReply(c,shared.ok);
5352 server.dirty++;
5353 }
ed9b544e 5354 } else {
697bd567 5355 redisPanic("Unknown list encoding");
ed9b544e 5356 }
5357}
5358
5359static void popGenericCommand(redisClient *c, int where) {
d72562f7
PN
5360 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5361 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
3305306f 5362
003f0840 5363 robj *value = listTypePop(o,where);
d72562f7 5364 if (value == NULL) {
dd88747b 5365 addReply(c,shared.nullbulk);
5366 } else {
d72562f7
PN
5367 addReplyBulk(c,value);
5368 decrRefCount(value);
003f0840 5369 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5370 server.dirty++;
ed9b544e 5371 }
5372}
5373
5374static void lpopCommand(redisClient *c) {
5375 popGenericCommand(c,REDIS_HEAD);
5376}
5377
5378static void rpopCommand(redisClient *c) {
5379 popGenericCommand(c,REDIS_TAIL);
5380}
5381
5382static void lrangeCommand(redisClient *c) {
a6dd455b 5383 robj *o, *value;
ed9b544e 5384 int start = atoi(c->argv[2]->ptr);
5385 int end = atoi(c->argv[3]->ptr);
dd88747b 5386 int llen;
5387 int rangelen, j;
003f0840 5388 listTypeEntry entry;
dd88747b 5389
4e27f268 5390 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5391 || checkType(c,o,REDIS_LIST)) return;
003f0840 5392 llen = listTypeLength(o);
dd88747b 5393
5394 /* convert negative indexes */
5395 if (start < 0) start = llen+start;
5396 if (end < 0) end = llen+end;
5397 if (start < 0) start = 0;
5398 if (end < 0) end = 0;
5399
5400 /* indexes sanity checks */
5401 if (start > end || start >= llen) {
5402 /* Out of range start or start > end result in empty list */
5403 addReply(c,shared.emptymultibulk);
5404 return;
5405 }
5406 if (end >= llen) end = llen-1;
5407 rangelen = (end-start)+1;
3305306f 5408
dd88747b 5409 /* Return the result in form of a multi-bulk reply */
dd88747b 5410 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
003f0840 5411 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
dd88747b 5412 for (j = 0; j < rangelen; j++) {
003f0840
PN
5413 redisAssert(listTypeNext(li,&entry));
5414 value = listTypeGet(&entry);
a6dd455b 5415 addReplyBulk(c,value);
be02a7c0 5416 decrRefCount(value);
ed9b544e 5417 }
003f0840 5418 listTypeReleaseIterator(li);
ed9b544e 5419}
5420
5421static void ltrimCommand(redisClient *c) {
3305306f 5422 robj *o;
ed9b544e 5423 int start = atoi(c->argv[2]->ptr);
5424 int end = atoi(c->argv[3]->ptr);
dd88747b 5425 int llen;
5426 int j, ltrim, rtrim;
5427 list *list;
5428 listNode *ln;
5429
5430 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5431 checkType(c,o,REDIS_LIST)) return;
003f0840 5432 llen = listTypeLength(o);
dd88747b 5433
5434 /* convert negative indexes */
5435 if (start < 0) start = llen+start;
5436 if (end < 0) end = llen+end;
5437 if (start < 0) start = 0;
5438 if (end < 0) end = 0;
5439
5440 /* indexes sanity checks */
5441 if (start > end || start >= llen) {
5442 /* Out of range start or start > end result in empty list */
5443 ltrim = llen;
5444 rtrim = 0;
ed9b544e 5445 } else {
dd88747b 5446 if (end >= llen) end = llen-1;
5447 ltrim = start;
5448 rtrim = llen-end-1;
5449 }
ed9b544e 5450
dd88747b 5451 /* Remove list elements to perform the trim */
9ae6b0be
PN
5452 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5453 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5454 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5455 } else if (o->encoding == REDIS_ENCODING_LIST) {
5456 list = o->ptr;
5457 for (j = 0; j < ltrim; j++) {
5458 ln = listFirst(list);
5459 listDelNode(list,ln);
5460 }
5461 for (j = 0; j < rtrim; j++) {
5462 ln = listLast(list);
5463 listDelNode(list,ln);
5464 }
5465 } else {
5466 redisPanic("Unknown list encoding");
ed9b544e 5467 }
003f0840 5468 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5469 server.dirty++;
5470 addReply(c,shared.ok);
ed9b544e 5471}
5472
5473static void lremCommand(redisClient *c) {
d2ee16ab 5474 robj *subject, *obj = c->argv[3];
dd88747b 5475 int toremove = atoi(c->argv[2]->ptr);
5476 int removed = 0;
003f0840 5477 listTypeEntry entry;
a4d1ba9a 5478
d2ee16ab
PN
5479 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5480 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
dd88747b 5481
d2ee16ab
PN
5482 /* Make sure obj is raw when we're dealing with a ziplist */
5483 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5484 obj = getDecodedObject(obj);
5485
003f0840 5486 listTypeIterator *li;
dd88747b 5487 if (toremove < 0) {
5488 toremove = -toremove;
003f0840 5489 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
d2ee16ab 5490 } else {
003f0840 5491 li = listTypeInitIterator(subject,0,REDIS_TAIL);
dd88747b 5492 }
dd88747b 5493
003f0840
PN
5494 while (listTypeNext(li,&entry)) {
5495 if (listTypeEqual(&entry,obj)) {
5496 listTypeDelete(&entry);
dd88747b 5497 server.dirty++;
5498 removed++;
3fbf9001 5499 if (toremove && removed == toremove) break;
ed9b544e 5500 }
5501 }
003f0840 5502 listTypeReleaseIterator(li);
d2ee16ab
PN
5503
5504 /* Clean up raw encoded object */
5505 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5506 decrRefCount(obj);
5507
003f0840 5508 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5509 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5510}
5511
12f9d551 5512/* This is the semantic of this command:
0f5f7e9a 5513 * RPOPLPUSH srclist dstlist:
12f9d551 5514 * IF LLEN(srclist) > 0
5515 * element = RPOP srclist
5516 * LPUSH dstlist element
5517 * RETURN element
5518 * ELSE
5519 * RETURN nil
5520 * END
5521 * END
5522 *
5523 * The idea is to be able to get an element from a list in a reliable way
5524 * since the element is not just returned but pushed against another list
5525 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5526 */
0f5f7e9a 5527static void rpoplpushcommand(redisClient *c) {
0f62e177 5528 robj *sobj, *value;
dd88747b 5529 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5530 checkType(c,sobj,REDIS_LIST)) return;
12f9d551 5531
003f0840 5532 if (listTypeLength(sobj) == 0) {
12f9d551 5533 addReply(c,shared.nullbulk);
5534 } else {
dd88747b 5535 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
0f62e177 5536 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
003f0840 5537 value = listTypePop(sobj,REDIS_TAIL);
12f9d551 5538
dd88747b 5539 /* Add the element to the target list (unless it's directly
5540 * passed to some BLPOP-ing client */
0f62e177
PN
5541 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5542 /* Create the list if the key does not exist */
5543 if (!dobj) {
1cd92e7f 5544 dobj = createZiplistObject();
09241813 5545 dbAdd(c->db,c->argv[2],dobj);
12f9d551 5546 }
003f0840 5547 listTypePush(dobj,value,REDIS_HEAD);
12f9d551 5548 }
dd88747b 5549
5550 /* Send the element to the client as reply as well */
0f62e177
PN
5551 addReplyBulk(c,value);
5552
003f0840 5553 /* listTypePop returns an object with its refcount incremented */
0f62e177 5554 decrRefCount(value);
dd88747b 5555
0f62e177 5556 /* Delete the source list when it is empty */
003f0840 5557 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5558 server.dirty++;
12f9d551 5559 }
5560}
5561
ed9b544e 5562/* ==================================== Sets ================================ */
5563
5564static void saddCommand(redisClient *c) {
ed9b544e 5565 robj *set;
5566
3305306f 5567 set = lookupKeyWrite(c->db,c->argv[1]);
5568 if (set == NULL) {
ed9b544e 5569 set = createSetObject();
09241813 5570 dbAdd(c->db,c->argv[1],set);
ed9b544e 5571 } else {
ed9b544e 5572 if (set->type != REDIS_SET) {
c937aa89 5573 addReply(c,shared.wrongtypeerr);
ed9b544e 5574 return;
5575 }
5576 }
5577 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5578 incrRefCount(c->argv[2]);
5579 server.dirty++;
c937aa89 5580 addReply(c,shared.cone);
ed9b544e 5581 } else {
c937aa89 5582 addReply(c,shared.czero);
ed9b544e 5583 }
5584}
5585
5586static void sremCommand(redisClient *c) {
3305306f 5587 robj *set;
ed9b544e 5588
dd88747b 5589 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5590 checkType(c,set,REDIS_SET)) return;
5591
5592 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5593 server.dirty++;
5594 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
09241813 5595 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5596 addReply(c,shared.cone);
ed9b544e 5597 } else {
dd88747b 5598 addReply(c,shared.czero);
ed9b544e 5599 }
5600}
5601
a4460ef4 5602static void smoveCommand(redisClient *c) {
5603 robj *srcset, *dstset;
5604
5605 srcset = lookupKeyWrite(c->db,c->argv[1]);
5606 dstset = lookupKeyWrite(c->db,c->argv[2]);
5607
5608 /* If the source key does not exist return 0, if it's of the wrong type
5609 * raise an error */
5610 if (srcset == NULL || srcset->type != REDIS_SET) {
5611 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5612 return;
5613 }
5614 /* Error if the destination key is not a set as well */
5615 if (dstset && dstset->type != REDIS_SET) {
5616 addReply(c,shared.wrongtypeerr);
5617 return;
5618 }
5619 /* Remove the element from the source set */
5620 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5621 /* Key not found in the src set! return zero */
5622 addReply(c,shared.czero);
5623 return;
5624 }
3ea27d37 5625 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
09241813 5626 dbDelete(c->db,c->argv[1]);
a4460ef4 5627 server.dirty++;
5628 /* Add the element to the destination set */
5629 if (!dstset) {
5630 dstset = createSetObject();
09241813 5631 dbAdd(c->db,c->argv[2],dstset);
a4460ef4 5632 }
5633 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5634 incrRefCount(c->argv[3]);
5635 addReply(c,shared.cone);
5636}
5637
ed9b544e 5638static void sismemberCommand(redisClient *c) {
3305306f 5639 robj *set;
ed9b544e 5640
dd88747b 5641 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5642 checkType(c,set,REDIS_SET)) return;
5643
5644 if (dictFind(set->ptr,c->argv[2]))
5645 addReply(c,shared.cone);
5646 else
c937aa89 5647 addReply(c,shared.czero);
ed9b544e 5648}
5649
5650static void scardCommand(redisClient *c) {
3305306f 5651 robj *o;
ed9b544e 5652 dict *s;
dd88747b 5653
5654 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5655 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5656
dd88747b 5657 s = o->ptr;
5658 addReplyUlong(c,dictSize(s));
ed9b544e 5659}
5660
12fea928 5661static void spopCommand(redisClient *c) {
5662 robj *set;
5663 dictEntry *de;
5664
dd88747b 5665 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5666 checkType(c,set,REDIS_SET)) return;
5667
5668 de = dictGetRandomKey(set->ptr);
5669 if (de == NULL) {
12fea928 5670 addReply(c,shared.nullbulk);
5671 } else {
dd88747b 5672 robj *ele = dictGetEntryKey(de);
12fea928 5673
dd88747b 5674 addReplyBulk(c,ele);
5675 dictDelete(set->ptr,ele);
5676 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
09241813 5677 if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5678 server.dirty++;
12fea928 5679 }
5680}
5681
2abb95a9 5682static void srandmemberCommand(redisClient *c) {
5683 robj *set;
5684 dictEntry *de;
5685
dd88747b 5686 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5687 checkType(c,set,REDIS_SET)) return;
5688
5689 de = dictGetRandomKey(set->ptr);
5690 if (de == NULL) {
2abb95a9 5691 addReply(c,shared.nullbulk);
5692 } else {
dd88747b 5693 robj *ele = dictGetEntryKey(de);
2abb95a9 5694
dd88747b 5695 addReplyBulk(c,ele);
2abb95a9 5696 }
5697}
5698
ed9b544e 5699static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5700 dict **d1 = (void*) s1, **d2 = (void*) s2;
5701
3305306f 5702 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5703}
5704
682ac724 5705static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5706 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5707 dictIterator *di;
5708 dictEntry *de;
5709 robj *lenobj = NULL, *dstset = NULL;
682ac724 5710 unsigned long j, cardinality = 0;
ed9b544e 5711
ed9b544e 5712 for (j = 0; j < setsnum; j++) {
5713 robj *setobj;
3305306f 5714
5715 setobj = dstkey ?
5716 lookupKeyWrite(c->db,setskeys[j]) :
5717 lookupKeyRead(c->db,setskeys[j]);
5718 if (!setobj) {
ed9b544e 5719 zfree(dv);
5faa6025 5720 if (dstkey) {
09241813 5721 if (dbDelete(c->db,dstkey))
fdcaae84 5722 server.dirty++;
0d36ded0 5723 addReply(c,shared.czero);
5faa6025 5724 } else {
4e27f268 5725 addReply(c,shared.emptymultibulk);
5faa6025 5726 }
ed9b544e 5727 return;
5728 }
ed9b544e 5729 if (setobj->type != REDIS_SET) {
5730 zfree(dv);
c937aa89 5731 addReply(c,shared.wrongtypeerr);
ed9b544e 5732 return;
5733 }
5734 dv[j] = setobj->ptr;
5735 }
5736 /* Sort sets from the smallest to largest, this will improve our
5737 * algorithm's performace */
5738 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5739
5740 /* The first thing we should output is the total number of elements...
5741 * since this is a multi-bulk write, but at this stage we don't know
5742 * the intersection set size, so we use a trick, append an empty object
5743 * to the output list and save the pointer to later modify it with the
5744 * right length */
5745 if (!dstkey) {
5746 lenobj = createObject(REDIS_STRING,NULL);
5747 addReply(c,lenobj);
5748 decrRefCount(lenobj);
5749 } else {
5750 /* If we have a target key where to store the resulting set
5751 * create this key with an empty set inside */
5752 dstset = createSetObject();
ed9b544e 5753 }
5754
5755 /* Iterate all the elements of the first (smallest) set, and test
5756 * the element against all the other sets, if at least one set does
5757 * not include the element it is discarded */
5758 di = dictGetIterator(dv[0]);
ed9b544e 5759
5760 while((de = dictNext(di)) != NULL) {
5761 robj *ele;
5762
5763 for (j = 1; j < setsnum; j++)
5764 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5765 if (j != setsnum)
5766 continue; /* at least one set does not contain the member */
5767 ele = dictGetEntryKey(de);
5768 if (!dstkey) {
dd88747b 5769 addReplyBulk(c,ele);
ed9b544e 5770 cardinality++;
5771 } else {
5772 dictAdd(dstset->ptr,ele,NULL);
5773 incrRefCount(ele);
5774 }
5775 }
5776 dictReleaseIterator(di);
5777
83cdfe18 5778 if (dstkey) {
3ea27d37 5779 /* Store the resulting set into the target, if the intersection
5780 * is not an empty set. */
09241813 5781 dbDelete(c->db,dstkey);
3ea27d37 5782 if (dictSize((dict*)dstset->ptr) > 0) {
09241813 5783 dbAdd(c->db,dstkey,dstset);
482b672d 5784 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5785 } else {
5786 decrRefCount(dstset);
d36c4e97 5787 addReply(c,shared.czero);
3ea27d37 5788 }
40d224a9 5789 server.dirty++;
d36c4e97 5790 } else {
5791 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5792 }
ed9b544e 5793 zfree(dv);
5794}
5795
5796static void sinterCommand(redisClient *c) {
5797 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5798}
5799
5800static void sinterstoreCommand(redisClient *c) {
5801 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5802}
5803
/* Set operation selectors, passed as 'op' to sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 5807
5808static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5809 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5810 dictIterator *di;
5811 dictEntry *de;
f4f56e1d 5812 robj *dstset = NULL;
40d224a9 5813 int j, cardinality = 0;
5814
40d224a9 5815 for (j = 0; j < setsnum; j++) {
5816 robj *setobj;
5817
5818 setobj = dstkey ?
5819 lookupKeyWrite(c->db,setskeys[j]) :
5820 lookupKeyRead(c->db,setskeys[j]);
5821 if (!setobj) {
5822 dv[j] = NULL;
5823 continue;
5824 }
5825 if (setobj->type != REDIS_SET) {
5826 zfree(dv);
5827 addReply(c,shared.wrongtypeerr);
5828 return;
5829 }
5830 dv[j] = setobj->ptr;
5831 }
5832
5833 /* We need a temp set object to store our union. If the dstkey
5834 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5835 * this set object will be the resulting object to set into the target key*/
5836 dstset = createSetObject();
5837
40d224a9 5838 /* Iterate all the elements of all the sets, add every element a single
5839 * time to the result set */
5840 for (j = 0; j < setsnum; j++) {
51829ed3 5841 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5842 if (!dv[j]) continue; /* non existing keys are like empty sets */
5843
5844 di = dictGetIterator(dv[j]);
40d224a9 5845
5846 while((de = dictNext(di)) != NULL) {
5847 robj *ele;
5848
5849 /* dictAdd will not add the same element multiple times */
5850 ele = dictGetEntryKey(de);
f4f56e1d 5851 if (op == REDIS_OP_UNION || j == 0) {
5852 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5853 incrRefCount(ele);
40d224a9 5854 cardinality++;
5855 }
f4f56e1d 5856 } else if (op == REDIS_OP_DIFF) {
5857 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5858 cardinality--;
5859 }
40d224a9 5860 }
5861 }
5862 dictReleaseIterator(di);
51829ed3 5863
d36c4e97 5864 /* result set is empty? Exit asap. */
5865 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5866 }
5867
f4f56e1d 5868 /* Output the content of the resulting set, if not in STORE mode */
5869 if (!dstkey) {
5870 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5871 di = dictGetIterator(dstset->ptr);
f4f56e1d 5872 while((de = dictNext(di)) != NULL) {
5873 robj *ele;
5874
5875 ele = dictGetEntryKey(de);
dd88747b 5876 addReplyBulk(c,ele);
f4f56e1d 5877 }
5878 dictReleaseIterator(di);
d36c4e97 5879 decrRefCount(dstset);
83cdfe18
AG
5880 } else {
5881 /* If we have a target key where to store the resulting set
5882 * create this key with the result set inside */
09241813 5883 dbDelete(c->db,dstkey);
3ea27d37 5884 if (dictSize((dict*)dstset->ptr) > 0) {
09241813 5885 dbAdd(c->db,dstkey,dstset);
482b672d 5886 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5887 } else {
5888 decrRefCount(dstset);
d36c4e97 5889 addReply(c,shared.czero);
3ea27d37 5890 }
40d224a9 5891 server.dirty++;
5892 }
5893 zfree(dv);
5894}
5895
5896static void sunionCommand(redisClient *c) {
f4f56e1d 5897 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5898}
5899
5900static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5901 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5902}
5903
5904static void sdiffCommand(redisClient *c) {
5905 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5906}
5907
5908static void sdiffstoreCommand(redisClient *c) {
5909 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5910}
5911
6b47e12e 5912/* ==================================== ZSets =============================== */
5913
5914/* ZSETs are ordered sets using two data structures to hold the same elements
5915 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5916 * data structure.
5917 *
5918 * The elements are added to a hash table mapping Redis objects to scores.
5919 * At the same time the elements are added to a skip list mapping scores
5920 * to Redis objects (so objects are sorted by scores in this "view"). */
5921
5922/* This skiplist implementation is almost a C translation of the original
5923 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5924 * Alternative to Balanced Trees", modified in three ways:
5925 * a) this implementation allows for repeated values.
5926 * b) the comparison is not just by key (our 'score') but by satellite data.
5927 * c) there is a back pointer, so it's a doubly linked list with the back
5928 * pointers being only at "level 1". This allows to traverse the list
5929 * from tail to head, useful for ZREVRANGE. */
5930
5931static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5932 zskiplistNode *zn = zmalloc(sizeof(*zn));
5933
5934 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5935 if (level > 1)
2b37892e 5936 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5937 else
5938 zn->span = NULL;
6b47e12e 5939 zn->score = score;
5940 zn->obj = obj;
5941 return zn;
5942}
5943
5944static zskiplist *zslCreate(void) {
5945 int j;
5946 zskiplist *zsl;
e0a62c7f 5947
6b47e12e 5948 zsl = zmalloc(sizeof(*zsl));
5949 zsl->level = 1;
cc812361 5950 zsl->length = 0;
6b47e12e 5951 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5952 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5953 zsl->header->forward[j] = NULL;
94e543b5 5954
5955 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5956 if (j < ZSKIPLIST_MAXLEVEL-1)
5957 zsl->header->span[j] = 0;
69d95c3e 5958 }
e3870fab 5959 zsl->header->backward = NULL;
5960 zsl->tail = NULL;
6b47e12e 5961 return zsl;
5962}
5963
fd8ccf44 5964static void zslFreeNode(zskiplistNode *node) {
5965 decrRefCount(node->obj);
ad807e6f 5966 zfree(node->forward);
69d95c3e 5967 zfree(node->span);
fd8ccf44 5968 zfree(node);
5969}
5970
5971static void zslFree(zskiplist *zsl) {
ad807e6f 5972 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5973
ad807e6f 5974 zfree(zsl->header->forward);
69d95c3e 5975 zfree(zsl->header->span);
ad807e6f 5976 zfree(zsl->header);
fd8ccf44 5977 while(node) {
599379dd 5978 next = node->forward[0];
fd8ccf44 5979 zslFreeNode(node);
5980 node = next;
5981 }
ad807e6f 5982 zfree(zsl);
fd8ccf44 5983}
5984
6b47e12e 5985static int zslRandomLevel(void) {
5986 int level = 1;
5987 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5988 level += 1;
10c2baa5 5989 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5990}
5991
5992static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5993 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5994 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5995 int i, level;
5996
5997 x = zsl->header;
5998 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5999 /* store rank that is crossed to reach the insert position */
6000 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 6001
9d60e6e4 6002 while (x->forward[i] &&
6003 (x->forward[i]->score < score ||
6004 (x->forward[i]->score == score &&
69d95c3e 6005 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 6006 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 6007 x = x->forward[i];
69d95c3e 6008 }
6b47e12e 6009 update[i] = x;
6010 }
6b47e12e 6011 /* we assume the key is not already inside, since we allow duplicated
6012 * scores, and the re-insertion of score and redis object should never
6013 * happpen since the caller of zslInsert() should test in the hash table
6014 * if the element is already inside or not. */
6015 level = zslRandomLevel();
6016 if (level > zsl->level) {
69d95c3e 6017 for (i = zsl->level; i < level; i++) {
2b37892e 6018 rank[i] = 0;
6b47e12e 6019 update[i] = zsl->header;
2b37892e 6020 update[i]->span[i-1] = zsl->length;
69d95c3e 6021 }
6b47e12e 6022 zsl->level = level;
6023 }
6024 x = zslCreateNode(level,score,obj);
6025 for (i = 0; i < level; i++) {
6026 x->forward[i] = update[i]->forward[i];
6027 update[i]->forward[i] = x;
69d95c3e
PN
6028
6029 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
6030 if (i > 0) {
6031 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6032 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6033 }
6b47e12e 6034 }
69d95c3e
PN
6035
6036 /* increment span for untouched levels */
6037 for (i = level; i < zsl->level; i++) {
2b37892e 6038 update[i]->span[i-1]++;
69d95c3e
PN
6039 }
6040
bb975144 6041 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 6042 if (x->forward[0])
6043 x->forward[0]->backward = x;
6044 else
6045 zsl->tail = x;
cc812361 6046 zsl->length++;
6b47e12e 6047}
6048
84105336
PN
6049/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6050void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6051 int i;
6052 for (i = 0; i < zsl->level; i++) {
6053 if (update[i]->forward[i] == x) {
6054 if (i > 0) {
6055 update[i]->span[i-1] += x->span[i-1] - 1;
6056 }
6057 update[i]->forward[i] = x->forward[i];
6058 } else {
6059 /* invariant: i > 0, because update[0]->forward[0]
6060 * is always equal to x */
6061 update[i]->span[i-1] -= 1;
6062 }
6063 }
6064 if (x->forward[0]) {
6065 x->forward[0]->backward = x->backward;
6066 } else {
6067 zsl->tail = x->backward;
6068 }
6069 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6070 zsl->level--;
6071 zsl->length--;
6072}
6073
50c55df5 6074/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 6075static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 6076 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6077 int i;
6078
6079 x = zsl->header;
6080 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 6081 while (x->forward[i] &&
6082 (x->forward[i]->score < score ||
6083 (x->forward[i]->score == score &&
6084 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 6085 x = x->forward[i];
6086 update[i] = x;
6087 }
6088 /* We may have multiple elements with the same score, what we need
6089 * is to find the element with both the right score and object. */
6090 x = x->forward[0];
bf028098 6091 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 6092 zslDeleteNode(zsl, x, update);
9d60e6e4 6093 zslFreeNode(x);
9d60e6e4 6094 return 1;
6095 } else {
6096 return 0; /* not found */
e197b441 6097 }
6098 return 0; /* not found */
fd8ccf44 6099}
6100
1807985b 6101/* Delete all the elements with score between min and max from the skiplist.
6102 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6103 * Note that this function takes the reference to the hash table view of the
6104 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 6105static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 6106 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6107 unsigned long removed = 0;
6108 int i;
6109
6110 x = zsl->header;
6111 for (i = zsl->level-1; i >= 0; i--) {
6112 while (x->forward[i] && x->forward[i]->score < min)
6113 x = x->forward[i];
6114 update[i] = x;
6115 }
6116 /* We may have multiple elements with the same score, what we need
6117 * is to find the element with both the right score and object. */
6118 x = x->forward[0];
6119 while (x && x->score <= max) {
84105336
PN
6120 zskiplistNode *next = x->forward[0];
6121 zslDeleteNode(zsl, x, update);
1807985b 6122 dictDelete(dict,x->obj);
6123 zslFreeNode(x);
1807985b 6124 removed++;
6125 x = next;
6126 }
6127 return removed; /* not found */
6128}
1807985b 6129
9212eafd 6130/* Delete all the elements with rank between start and end from the skiplist.
2424490f 6131 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
6132static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6133 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6134 unsigned long traversed = 0, removed = 0;
6135 int i;
6136
9212eafd
PN
6137 x = zsl->header;
6138 for (i = zsl->level-1; i >= 0; i--) {
6139 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6140 traversed += i > 0 ? x->span[i-1] : 1;
6141 x = x->forward[i];
1807985b 6142 }
9212eafd
PN
6143 update[i] = x;
6144 }
6145
6146 traversed++;
6147 x = x->forward[0];
6148 while (x && traversed <= end) {
84105336
PN
6149 zskiplistNode *next = x->forward[0];
6150 zslDeleteNode(zsl, x, update);
1807985b 6151 dictDelete(dict,x->obj);
6152 zslFreeNode(x);
1807985b 6153 removed++;
9212eafd 6154 traversed++;
1807985b 6155 x = next;
6156 }
9212eafd 6157 return removed;
1807985b 6158}
6159
50c55df5 6160/* Find the first node having a score equal or greater than the specified one.
6161 * Returns NULL if there is no match. */
6162static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6163 zskiplistNode *x;
6164 int i;
6165
6166 x = zsl->header;
6167 for (i = zsl->level-1; i >= 0; i--) {
6168 while (x->forward[i] && x->forward[i]->score < score)
6169 x = x->forward[i];
6170 }
6171 /* We may have multiple elements with the same score, what we need
6172 * is to find the element with both the right score and object. */
6173 return x->forward[0];
6174}
6175
27b0ccca
PN
6176/* Find the rank for an element by both score and key.
6177 * Returns 0 when the element cannot be found, rank otherwise.
6178 * Note that the rank is 1-based due to the span of zsl->header to the
6179 * first element. */
003f0840 6180static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
27b0ccca
PN
6181 zskiplistNode *x;
6182 unsigned long rank = 0;
6183 int i;
6184
6185 x = zsl->header;
6186 for (i = zsl->level-1; i >= 0; i--) {
6187 while (x->forward[i] &&
6188 (x->forward[i]->score < score ||
6189 (x->forward[i]->score == score &&
6190 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 6191 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
6192 x = x->forward[i];
6193 }
6194
6195 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 6196 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
6197 return rank;
6198 }
6199 }
6200 return 0;
6201}
6202
e74825c2 6203/* Finds an element by its rank. The rank argument needs to be 1-based. */
003f0840 6204zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
e74825c2
PN
6205 zskiplistNode *x;
6206 unsigned long traversed = 0;
6207 int i;
6208
6209 x = zsl->header;
6210 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 6211 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6212 {
a50ea45c 6213 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
6214 x = x->forward[i];
6215 }
e74825c2
PN
6216 if (traversed == rank) {
6217 return x;
6218 }
6219 }
6220 return NULL;
6221}
6222
fd8ccf44 6223/* The actual Z-commands implementations */
6224
7db723ad 6225/* This generic command implements both ZADD and ZINCRBY.
e2665397 6226 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 6227 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 6228static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 6229 robj *zsetobj;
6230 zset *zs;
6231 double *score;
6232
5fc9229c 6233 if (isnan(scoreval)) {
6234 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6235 return;
6236 }
6237
e2665397 6238 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 6239 if (zsetobj == NULL) {
6240 zsetobj = createZsetObject();
09241813 6241 dbAdd(c->db,key,zsetobj);
fd8ccf44 6242 } else {
6243 if (zsetobj->type != REDIS_ZSET) {
6244 addReply(c,shared.wrongtypeerr);
6245 return;
6246 }
6247 }
fd8ccf44 6248 zs = zsetobj->ptr;
e2665397 6249
7db723ad 6250 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 6251 * needs to handle the two different conditions. It's all about setting
6252 * '*score', that is, the new score to set, to the right value. */
6253 score = zmalloc(sizeof(double));
6254 if (doincrement) {
6255 dictEntry *de;
6256
6257 /* Read the old score. If the element was not present starts from 0 */
6258 de = dictFind(zs->dict,ele);
6259 if (de) {
6260 double *oldscore = dictGetEntryVal(de);
6261 *score = *oldscore + scoreval;
6262 } else {
6263 *score = scoreval;
6264 }
5fc9229c 6265 if (isnan(*score)) {
6266 addReplySds(c,
6267 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6268 zfree(score);
6269 /* Note that we don't need to check if the zset may be empty and
6270 * should be removed here, as we can only obtain Nan as score if
6271 * there was already an element in the sorted set. */
6272 return;
6273 }
e2665397 6274 } else {
6275 *score = scoreval;
6276 }
6277
6278 /* What follows is a simple remove and re-insert operation that is common
7db723ad 6279 * to both ZADD and ZINCRBY... */
e2665397 6280 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 6281 /* case 1: New element */
e2665397 6282 incrRefCount(ele); /* added to hash */
6283 zslInsert(zs->zsl,*score,ele);
6284 incrRefCount(ele); /* added to skiplist */
fd8ccf44 6285 server.dirty++;
e2665397 6286 if (doincrement)
e2665397 6287 addReplyDouble(c,*score);
91d71bfc 6288 else
6289 addReply(c,shared.cone);
fd8ccf44 6290 } else {
6291 dictEntry *de;
6292 double *oldscore;
e0a62c7f 6293
fd8ccf44 6294 /* case 2: Score update operation */
e2665397 6295 de = dictFind(zs->dict,ele);
dfc5e96c 6296 redisAssert(de != NULL);
fd8ccf44 6297 oldscore = dictGetEntryVal(de);
6298 if (*score != *oldscore) {
6299 int deleted;
6300
e2665397 6301 /* Remove and insert the element in the skip list with new score */
6302 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 6303 redisAssert(deleted != 0);
e2665397 6304 zslInsert(zs->zsl,*score,ele);
6305 incrRefCount(ele);
6306 /* Update the score in the hash table */
6307 dictReplace(zs->dict,ele,score);
fd8ccf44 6308 server.dirty++;
2161a965 6309 } else {
6310 zfree(score);
fd8ccf44 6311 }
e2665397 6312 if (doincrement)
6313 addReplyDouble(c,*score);
6314 else
6315 addReply(c,shared.czero);
fd8ccf44 6316 }
6317}
6318
e2665397 6319static void zaddCommand(redisClient *c) {
6320 double scoreval;
6321
bd79a6bd 6322 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6323 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6324}
6325
7db723ad 6326static void zincrbyCommand(redisClient *c) {
e2665397 6327 double scoreval;
6328
bd79a6bd 6329 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6330 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6331}
6332
1b7106e7 6333static void zremCommand(redisClient *c) {
6334 robj *zsetobj;
6335 zset *zs;
dd88747b 6336 dictEntry *de;
6337 double *oldscore;
6338 int deleted;
1b7106e7 6339
dd88747b 6340 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6341 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 6342
dd88747b 6343 zs = zsetobj->ptr;
6344 de = dictFind(zs->dict,c->argv[2]);
6345 if (de == NULL) {
6346 addReply(c,shared.czero);
6347 return;
1b7106e7 6348 }
dd88747b 6349 /* Delete from the skiplist */
6350 oldscore = dictGetEntryVal(de);
6351 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6352 redisAssert(deleted != 0);
6353
6354 /* Delete from the hash table */
6355 dictDelete(zs->dict,c->argv[2]);
6356 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6357 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6358 server.dirty++;
6359 addReply(c,shared.cone);
1b7106e7 6360}
6361
1807985b 6362static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
6363 double min;
6364 double max;
dd88747b 6365 long deleted;
1807985b 6366 robj *zsetobj;
6367 zset *zs;
6368
bd79a6bd
PN
6369 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6370 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 6371
dd88747b 6372 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6373 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 6374
dd88747b 6375 zs = zsetobj->ptr;
6376 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6377 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6378 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6379 server.dirty += deleted;
482b672d 6380 addReplyLongLong(c,deleted);
1807985b 6381}
6382
9212eafd 6383static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
6384 long start;
6385 long end;
dd88747b 6386 int llen;
6387 long deleted;
9212eafd
PN
6388 robj *zsetobj;
6389 zset *zs;
6390
bd79a6bd
PN
6391 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6392 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6393
dd88747b 6394 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6395 checkType(c,zsetobj,REDIS_ZSET)) return;
6396 zs = zsetobj->ptr;
6397 llen = zs->zsl->length;
9212eafd 6398
dd88747b 6399 /* convert negative indexes */
6400 if (start < 0) start = llen+start;
6401 if (end < 0) end = llen+end;
6402 if (start < 0) start = 0;
6403 if (end < 0) end = 0;
9212eafd 6404
dd88747b 6405 /* indexes sanity checks */
6406 if (start > end || start >= llen) {
6407 addReply(c,shared.czero);
6408 return;
9212eafd 6409 }
dd88747b 6410 if (end >= llen) end = llen-1;
6411
6412 /* increment start and end because zsl*Rank functions
6413 * use 1-based rank */
6414 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6415 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6416 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6417 server.dirty += deleted;
482b672d 6418 addReplyLongLong(c, deleted);
9212eafd
PN
6419}
6420
8f92e768
PN
6421typedef struct {
6422 dict *dict;
6423 double weight;
6424} zsetopsrc;
6425
6426static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6427 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6428 unsigned long size1, size2;
6429 size1 = d1->dict ? dictSize(d1->dict) : 0;
6430 size2 = d2->dict ? dictSize(d2->dict) : 0;
6431 return size1 - size2;
6432}
6433
d2764cd6
PN
/* How the scores of an element found in several sources are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry; an entry with a NULL value counts as 1.0. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': sum of the two, the
 * smaller of the two, or the larger of the two. Any other aggregate value
 * triggers a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6451
2830ca53 6452static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6453 int i, j, setnum;
d2764cd6 6454 int aggregate = REDIS_AGGR_SUM;
8f92e768 6455 zsetopsrc *src;
2830ca53
PN
6456 robj *dstobj;
6457 zset *dstzset;
b287c9bb
PN
6458 dictIterator *di;
6459 dictEntry *de;
6460
bc000c1d
JC
6461 /* expect setnum input keys to be given */
6462 setnum = atoi(c->argv[2]->ptr);
6463 if (setnum < 1) {
5d373da9 6464 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6465 return;
b287c9bb 6466 }
2830ca53
PN
6467
6468 /* test if the expected number of keys would overflow */
bc000c1d 6469 if (3+setnum > c->argc) {
b287c9bb
PN
6470 addReply(c,shared.syntaxerr);
6471 return;
6472 }
6473
2830ca53 6474 /* read keys to be used for input */
bc000c1d
JC
6475 src = zmalloc(sizeof(zsetopsrc) * setnum);
6476 for (i = 0, j = 3; i < setnum; i++, j++) {
6477 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6478 if (!obj) {
8f92e768 6479 src[i].dict = NULL;
b287c9bb 6480 } else {
bc000c1d
JC
6481 if (obj->type == REDIS_ZSET) {
6482 src[i].dict = ((zset*)obj->ptr)->dict;
6483 } else if (obj->type == REDIS_SET) {
6484 src[i].dict = (obj->ptr);
6485 } else {
8f92e768 6486 zfree(src);
b287c9bb
PN
6487 addReply(c,shared.wrongtypeerr);
6488 return;
6489 }
b287c9bb 6490 }
2830ca53
PN
6491
6492 /* default all weights to 1 */
8f92e768 6493 src[i].weight = 1.0;
b287c9bb
PN
6494 }
6495
2830ca53
PN
6496 /* parse optional extra arguments */
6497 if (j < c->argc) {
d2764cd6 6498 int remaining = c->argc - j;
b287c9bb 6499
2830ca53 6500 while (remaining) {
bc000c1d 6501 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6502 j++; remaining--;
bc000c1d 6503 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6504 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6505 return;
2830ca53 6506 }
d2764cd6
PN
6507 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6508 j++; remaining--;
6509 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6510 aggregate = REDIS_AGGR_SUM;
6511 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6512 aggregate = REDIS_AGGR_MIN;
6513 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6514 aggregate = REDIS_AGGR_MAX;
6515 } else {
6516 zfree(src);
6517 addReply(c,shared.syntaxerr);
6518 return;
6519 }
6520 j++; remaining--;
2830ca53 6521 } else {
8f92e768 6522 zfree(src);
2830ca53
PN
6523 addReply(c,shared.syntaxerr);
6524 return;
6525 }
6526 }
6527 }
b287c9bb 6528
d2764cd6
PN
6529 /* sort sets from the smallest to largest, this will improve our
6530 * algorithm's performance */
bc000c1d 6531 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6532
2830ca53
PN
6533 dstobj = createZsetObject();
6534 dstzset = dstobj->ptr;
6535
6536 if (op == REDIS_OP_INTER) {
8f92e768
PN
6537 /* skip going over all entries if the smallest zset is NULL or empty */
6538 if (src[0].dict && dictSize(src[0].dict) > 0) {
6539 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6540 * from small to large, all src[i > 0].dict are non-empty too */
6541 di = dictGetIterator(src[0].dict);
2830ca53 6542 while((de = dictNext(di)) != NULL) {
d2764cd6 6543 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6544 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6545
bc000c1d 6546 for (j = 1; j < setnum; j++) {
d2764cd6 6547 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6548 if (other) {
bc000c1d 6549 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6550 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6551 } else {
6552 break;
6553 }
6554 }
b287c9bb 6555
2830ca53 6556 /* skip entry when not present in every source dict */
bc000c1d 6557 if (j != setnum) {
2830ca53
PN
6558 zfree(score);
6559 } else {
6560 robj *o = dictGetEntryKey(de);
6561 dictAdd(dstzset->dict,o,score);
6562 incrRefCount(o); /* added to dictionary */
6563 zslInsert(dstzset->zsl,*score,o);
6564 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6565 }
6566 }
2830ca53
PN
6567 dictReleaseIterator(di);
6568 }
6569 } else if (op == REDIS_OP_UNION) {
bc000c1d 6570 for (i = 0; i < setnum; i++) {
8f92e768 6571 if (!src[i].dict) continue;
2830ca53 6572
8f92e768 6573 di = dictGetIterator(src[i].dict);
2830ca53
PN
6574 while((de = dictNext(di)) != NULL) {
6575 /* skip key when already processed */
6576 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6577
d2764cd6 6578 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6579 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6580
d2764cd6
PN
6581 /* because the zsets are sorted by size, its only possible
6582 * for sets at larger indices to hold this entry */
bc000c1d 6583 for (j = (i+1); j < setnum; j++) {
d2764cd6 6584 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6585 if (other) {
bc000c1d 6586 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6587 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6588 }
6589 }
b287c9bb 6590
2830ca53
PN
6591 robj *o = dictGetEntryKey(de);
6592 dictAdd(dstzset->dict,o,score);
6593 incrRefCount(o); /* added to dictionary */
6594 zslInsert(dstzset->zsl,*score,o);
6595 incrRefCount(o); /* added to skiplist */
6596 }
6597 dictReleaseIterator(di);
b287c9bb 6598 }
2830ca53
PN
6599 } else {
6600 /* unknown operator */
6601 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6602 }
6603
09241813 6604 dbDelete(c->db,dstkey);
3ea27d37 6605 if (dstzset->zsl->length) {
09241813 6606 dbAdd(c->db,dstkey,dstobj);
482b672d 6607 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6608 server.dirty++;
6609 } else {
8bca8773 6610 decrRefCount(dstobj);
3ea27d37 6611 addReply(c, shared.czero);
6612 }
8f92e768 6613 zfree(src);
b287c9bb
PN
6614}
6615
5d373da9 6616static void zunionstoreCommand(redisClient *c) {
2830ca53 6617 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6618}
6619
5d373da9 6620static void zinterstoreCommand(redisClient *c) {
2830ca53 6621 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6622}
6623
e3870fab 6624static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6625 robj *o;
bbe025e0
AM
6626 long start;
6627 long end;
752da584 6628 int withscores = 0;
dd88747b 6629 int llen;
6630 int rangelen, j;
6631 zset *zsetobj;
6632 zskiplist *zsl;
6633 zskiplistNode *ln;
6634 robj *ele;
752da584 6635
bd79a6bd
PN
6636 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6637 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6638
752da584 6639 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6640 withscores = 1;
6641 } else if (c->argc >= 5) {
6642 addReply(c,shared.syntaxerr);
6643 return;
6644 }
cc812361 6645
4e27f268 6646 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6647 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6648 zsetobj = o->ptr;
6649 zsl = zsetobj->zsl;
6650 llen = zsl->length;
cc812361 6651
dd88747b 6652 /* convert negative indexes */
6653 if (start < 0) start = llen+start;
6654 if (end < 0) end = llen+end;
6655 if (start < 0) start = 0;
6656 if (end < 0) end = 0;
cc812361 6657
dd88747b 6658 /* indexes sanity checks */
6659 if (start > end || start >= llen) {
6660 /* Out of range start or start > end result in empty list */
6661 addReply(c,shared.emptymultibulk);
6662 return;
6663 }
6664 if (end >= llen) end = llen-1;
6665 rangelen = (end-start)+1;
cc812361 6666
dd88747b 6667 /* check if starting point is trivial, before searching
6668 * the element in log(N) time */
6669 if (reverse) {
003f0840 6670 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
dd88747b 6671 } else {
6672 ln = start == 0 ?
003f0840 6673 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
dd88747b 6674 }
cc812361 6675
dd88747b 6676 /* Return the result in form of a multi-bulk reply */
6677 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6678 withscores ? (rangelen*2) : rangelen));
6679 for (j = 0; j < rangelen; j++) {
6680 ele = ln->obj;
6681 addReplyBulk(c,ele);
6682 if (withscores)
6683 addReplyDouble(c,ln->score);
6684 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6685 }
6686}
6687
e3870fab 6688static void zrangeCommand(redisClient *c) {
6689 zrangeGenericCommand(c,0);
6690}
6691
6692static void zrevrangeCommand(redisClient *c) {
6693 zrangeGenericCommand(c,1);
6694}
6695
f44dd428 6696/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6697 * If justcount is non-zero, just the count is returned. */
6698static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6699 robj *o;
f44dd428 6700 double min, max;
6701 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6702 int offset = 0, limit = -1;
0500ef27
SH
6703 int withscores = 0;
6704 int badsyntax = 0;
6705
f44dd428 6706 /* Parse the min-max interval. If one of the values is prefixed
6707 * by the "(" character, it's considered "open". For instance
6708 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6709 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6710 if (((char*)c->argv[2]->ptr)[0] == '(') {
6711 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6712 minex = 1;
6713 } else {
6714 min = strtod(c->argv[2]->ptr,NULL);
6715 }
6716 if (((char*)c->argv[3]->ptr)[0] == '(') {
6717 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6718 maxex = 1;
6719 } else {
6720 max = strtod(c->argv[3]->ptr,NULL);
6721 }
6722
6723 /* Parse "WITHSCORES": note that if the command was called with
6724 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6725 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6726 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6727 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6728 withscores = 1;
6729 else
6730 badsyntax = 1;
0500ef27 6731 }
3a3978b1 6732 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6733 badsyntax = 1;
0500ef27 6734 if (badsyntax) {
454d4e43 6735 addReplySds(c,
6736 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6737 return;
0500ef27
SH
6738 }
6739
f44dd428 6740 /* Parse "LIMIT" */
0500ef27 6741 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6742 addReply(c,shared.syntaxerr);
6743 return;
0500ef27 6744 } else if (c->argc == (7 + withscores)) {
80181f78 6745 offset = atoi(c->argv[5]->ptr);
6746 limit = atoi(c->argv[6]->ptr);
0b13687c 6747 if (offset < 0) offset = 0;
80181f78 6748 }
50c55df5 6749
f44dd428 6750 /* Ok, lookup the key and get the range */
50c55df5 6751 o = lookupKeyRead(c->db,c->argv[1]);
6752 if (o == NULL) {
4e27f268 6753 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6754 } else {
6755 if (o->type != REDIS_ZSET) {
6756 addReply(c,shared.wrongtypeerr);
6757 } else {
6758 zset *zsetobj = o->ptr;
6759 zskiplist *zsl = zsetobj->zsl;
6760 zskiplistNode *ln;
f44dd428 6761 robj *ele, *lenobj = NULL;
6762 unsigned long rangelen = 0;
50c55df5 6763
f44dd428 6764 /* Get the first node with the score >= min, or with
6765 * score > min if 'minex' is true. */
50c55df5 6766 ln = zslFirstWithScore(zsl,min);
f44dd428 6767 while (minex && ln && ln->score == min) ln = ln->forward[0];
6768
50c55df5 6769 if (ln == NULL) {
6770 /* No element matching the speciifed interval */
f44dd428 6771 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6772 return;
6773 }
6774
6775 /* We don't know in advance how many matching elements there
6776 * are in the list, so we push this object that will represent
6777 * the multi-bulk length in the output buffer, and will "fix"
6778 * it later */
f44dd428 6779 if (!justcount) {
6780 lenobj = createObject(REDIS_STRING,NULL);
6781 addReply(c,lenobj);
6782 decrRefCount(lenobj);
6783 }
50c55df5 6784
f44dd428 6785 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6786 if (offset) {
6787 offset--;
6788 ln = ln->forward[0];
6789 continue;
6790 }
6791 if (limit == 0) break;
f44dd428 6792 if (!justcount) {
6793 ele = ln->obj;
dd88747b 6794 addReplyBulk(c,ele);
f44dd428 6795 if (withscores)
6796 addReplyDouble(c,ln->score);
6797 }
50c55df5 6798 ln = ln->forward[0];
6799 rangelen++;
80181f78 6800 if (limit > 0) limit--;
50c55df5 6801 }
f44dd428 6802 if (justcount) {
482b672d 6803 addReplyLongLong(c,(long)rangelen);
f44dd428 6804 } else {
6805 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6806 withscores ? (rangelen*2) : rangelen);
6807 }
50c55df5 6808 }
6809 }
6810}
6811
f44dd428 6812static void zrangebyscoreCommand(redisClient *c) {
6813 genericZrangebyscoreCommand(c,0);
6814}
6815
6816static void zcountCommand(redisClient *c) {
6817 genericZrangebyscoreCommand(c,1);
6818}
6819
3c41331e 6820static void zcardCommand(redisClient *c) {
e197b441 6821 robj *o;
6822 zset *zs;
dd88747b 6823
6824 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6825 checkType(c,o,REDIS_ZSET)) return;
6826
6827 zs = o->ptr;
6828 addReplyUlong(c,zs->zsl->length);
e197b441 6829}
6830
6e333bbe 6831static void zscoreCommand(redisClient *c) {
6832 robj *o;
6833 zset *zs;
dd88747b 6834 dictEntry *de;
6835
6836 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6837 checkType(c,o,REDIS_ZSET)) return;
6838
6839 zs = o->ptr;
6840 de = dictFind(zs->dict,c->argv[2]);
6841 if (!de) {
96d8b4ee 6842 addReply(c,shared.nullbulk);
6e333bbe 6843 } else {
dd88747b 6844 double *score = dictGetEntryVal(de);
6e333bbe 6845
dd88747b 6846 addReplyDouble(c,*score);
6e333bbe 6847 }
6848}
6849
798d9e55 6850static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6851 robj *o;
dd88747b 6852 zset *zs;
6853 zskiplist *zsl;
6854 dictEntry *de;
6855 unsigned long rank;
6856 double *score;
6857
6858 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6859 checkType(c,o,REDIS_ZSET)) return;
6860
6861 zs = o->ptr;
6862 zsl = zs->zsl;
6863 de = dictFind(zs->dict,c->argv[2]);
6864 if (!de) {
69d95c3e
PN
6865 addReply(c,shared.nullbulk);
6866 return;
6867 }
69d95c3e 6868
dd88747b 6869 score = dictGetEntryVal(de);
003f0840 6870 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
dd88747b 6871 if (rank) {
6872 if (reverse) {
482b672d 6873 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6874 } else {
482b672d 6875 addReplyLongLong(c, rank-1);
69d95c3e 6876 }
dd88747b 6877 } else {
6878 addReply(c,shared.nullbulk);
978c2c94 6879 }
6880}
6881
798d9e55
PN
6882static void zrankCommand(redisClient *c) {
6883 zrankGenericCommand(c, 0);
6884}
6885
6886static void zrevrankCommand(redisClient *c) {
6887 zrankGenericCommand(c, 1);
6888}
6889
7fb16bac
PN
6890/* ========================= Hashes utility functions ======================= */
/* Selector flags for the field (key) and the value of a hash entry. The hash
 * code in this file tests them with '&' and combines them with '|'. */
6891#define REDIS_HASH_KEY 1
6892#define REDIS_HASH_VALUE 2
978c2c94 6893
7fb16bac
PN
6894/* Check the length of a number of objects to see if we need to convert a
6895 * zipmap to a real hash. Note that we only check string encoded objects
6896 * as their string length can be queried in constant time. */
d1578a33 6897static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
7fb16bac
PN
6898 int i;
6899 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6900
7fb16bac
PN
6901 for (i = start; i <= end; i++) {
6902 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6903 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6904 {
6905 convertToRealHash(subject);
978c2c94 6906 return;
6907 }
6908 }
7fb16bac 6909}
bae2c7ec 6910
97224de7 6911/* Encode given objects in-place when the hash uses a dict. */
d1578a33 6912static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
97224de7 6913 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6914 if (o1) *o1 = tryObjectEncoding(*o1);
6915 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6916 }
6917}
6918
7fb16bac 6919/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6920 * object or NULL if the value cannot be found. The refcount of the object
6921 * is always increased by 1 when the value was found. */
d1578a33 6922static robj *hashTypeGet(robj *o, robj *key) {
7fb16bac 6923 robj *value = NULL;
978c2c94 6924 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6925 unsigned char *v;
6926 unsigned int vlen;
6927 key = getDecodedObject(key);
6928 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6929 value = createStringObject((char*)v,vlen);
6930 }
6931 decrRefCount(key);
6932 } else {
6933 dictEntry *de = dictFind(o->ptr,key);
6934 if (de != NULL) {
6935 value = dictGetEntryVal(de);
a3f3af86 6936 incrRefCount(value);
7fb16bac
PN
6937 }
6938 }
6939 return value;
6940}
978c2c94 6941
7fb16bac
PN
6942/* Test if the key exists in the given hash. Returns 1 if the key
6943 * exists and 0 when it doesn't. */
d1578a33 6944static int hashTypeExists(robj *o, robj *key) {
7fb16bac
PN
6945 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6946 key = getDecodedObject(key);
6947 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6948 decrRefCount(key);
6949 return 1;
6950 }
6951 decrRefCount(key);
6952 } else {
6953 if (dictFind(o->ptr,key) != NULL) {
6954 return 1;
6955 }
6956 }
6957 return 0;
6958}
bae2c7ec 6959
7fb16bac
PN
6960/* Add an element, discard the old if the key already exists.
6961 * Return 0 on insert and 1 on update. */
d1578a33 6962static int hashTypeSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6963 int update = 0;
6964 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6965 key = getDecodedObject(key);
6966 value = getDecodedObject(value);
6967 o->ptr = zipmapSet(o->ptr,
6968 key->ptr,sdslen(key->ptr),
6969 value->ptr,sdslen(value->ptr), &update);
6970 decrRefCount(key);
6971 decrRefCount(value);
6972
6973 /* Check if the zipmap needs to be upgraded to a real hash table */
6974 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6975 convertToRealHash(o);
978c2c94 6976 } else {
7fb16bac
PN
6977 if (dictReplace(o->ptr,key,value)) {
6978 /* Insert */
6979 incrRefCount(key);
978c2c94 6980 } else {
7fb16bac 6981 /* Update */
978c2c94 6982 update = 1;
6983 }
7fb16bac 6984 incrRefCount(value);
978c2c94 6985 }
7fb16bac 6986 return update;
978c2c94 6987}
6988
7fb16bac
PN
6989/* Delete an element from a hash.
6990 * Return 1 on deleted and 0 on not found. */
d1578a33 6991static int hashTypeDelete(robj *o, robj *key) {
7fb16bac
PN
6992 int deleted = 0;
6993 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6994 key = getDecodedObject(key);
6995 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6996 decrRefCount(key);
6997 } else {
6998 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6999 /* Always check if the dictionary needs a resize after a delete. */
7000 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 7001 }
7fb16bac
PN
7002 return deleted;
7003}
d33278d1 7004
7fb16bac 7005/* Return the number of elements in a hash. */
d1578a33 7006static unsigned long hashTypeLength(robj *o) {
7fb16bac
PN
7007 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7008 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7009}
7010
7011/* Structure to hold hash iteration abstration. Note that iteration over
7012 * hashes involves both fields and values. Because it is possible that
7013 * not both are required, store pointers in the iterator to avoid
7014 * unnecessary memory allocation for fields/values. */
7015typedef struct {
7016 int encoding;
7017 unsigned char *zi;
7018 unsigned char *zk, *zv;
7019 unsigned int zklen, zvlen;
7020
7021 dictIterator *di;
7022 dictEntry *de;
d1578a33 7023} hashTypeIterator;
7fb16bac 7024
d1578a33
PN
7025static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7026 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7fb16bac
PN
7027 hi->encoding = subject->encoding;
7028 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7029 hi->zi = zipmapRewind(subject->ptr);
7030 } else if (hi->encoding == REDIS_ENCODING_HT) {
7031 hi->di = dictGetIterator(subject->ptr);
d33278d1 7032 } else {
7fb16bac 7033 redisAssert(NULL);
d33278d1 7034 }
c44d3b56 7035 return hi;
7fb16bac 7036}
d33278d1 7037
d1578a33 7038static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7fb16bac
PN
7039 if (hi->encoding == REDIS_ENCODING_HT) {
7040 dictReleaseIterator(hi->di);
d33278d1 7041 }
c44d3b56 7042 zfree(hi);
7fb16bac 7043}
d33278d1 7044
7fb16bac
PN
7045/* Move to the next entry in the hash. Return REDIS_OK when the next entry
7046 * could be found and REDIS_ERR when the iterator reaches the end. */
d1578a33 7047static int hashTypeNext(hashTypeIterator *hi) {
7fb16bac
PN
7048 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7049 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7050 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7051 } else {
7052 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7053 }
7054 return REDIS_OK;
7055}
d33278d1 7056
0c390abc 7057/* Get key or value object at current iteration position.
a3f3af86 7058 * This increases the refcount of the field object by 1. */
d1578a33 7059static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7fb16bac
PN
7060 robj *o;
7061 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7062 if (what & REDIS_HASH_KEY) {
7063 o = createStringObject((char*)hi->zk,hi->zklen);
7064 } else {
7065 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 7066 }
d33278d1 7067 } else {
7fb16bac
PN
7068 if (what & REDIS_HASH_KEY) {
7069 o = dictGetEntryKey(hi->de);
7070 } else {
7071 o = dictGetEntryVal(hi->de);
d33278d1 7072 }
a3f3af86 7073 incrRefCount(o);
d33278d1 7074 }
7fb16bac 7075 return o;
d33278d1
PN
7076}
7077
d1578a33 7078static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7fb16bac 7079 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
7080 if (o == NULL) {
7081 o = createHashObject();
09241813 7082 dbAdd(c->db,key,o);
01426b05
PN
7083 } else {
7084 if (o->type != REDIS_HASH) {
7085 addReply(c,shared.wrongtypeerr);
7fb16bac 7086 return NULL;
01426b05
PN
7087 }
7088 }
7fb16bac
PN
7089 return o;
7090}
01426b05 7091
7fb16bac
PN
7092/* ============================= Hash commands ============================== */
7093static void hsetCommand(redisClient *c) {
6e9e463f 7094 int update;
7fb16bac 7095 robj *o;
bbe025e0 7096
d1578a33
PN
7097 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7098 hashTypeTryConversion(o,c->argv,2,3);
7099 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7100 update = hashTypeSet(o,c->argv[2],c->argv[3]);
6e9e463f 7101 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
7102 server.dirty++;
7103}
01426b05 7104
1f1c7695
PN
7105static void hsetnxCommand(redisClient *c) {
7106 robj *o;
d1578a33
PN
7107 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7108 hashTypeTryConversion(o,c->argv,2,3);
1f1c7695 7109
d1578a33 7110 if (hashTypeExists(o, c->argv[2])) {
1f1c7695 7111 addReply(c, shared.czero);
01426b05 7112 } else {
d1578a33
PN
7113 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7114 hashTypeSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
7115 addReply(c, shared.cone);
7116 server.dirty++;
7117 }
7118}
01426b05 7119
7fb16bac
PN
7120static void hmsetCommand(redisClient *c) {
7121 int i;
7122 robj *o;
01426b05 7123
7fb16bac
PN
7124 if ((c->argc % 2) == 1) {
7125 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7126 return;
7127 }
01426b05 7128
d1578a33
PN
7129 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7130 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7fb16bac 7131 for (i = 2; i < c->argc; i += 2) {
d1578a33
PN
7132 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7133 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
7134 }
7135 addReply(c, shared.ok);
edc2f63a 7136 server.dirty++;
7fb16bac
PN
7137}
7138
7139static void hincrbyCommand(redisClient *c) {
7140 long long value, incr;
7141 robj *o, *current, *new;
7142
bd79a6bd 7143 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
d1578a33
PN
7144 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7145 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
946342c1
PN
7146 if (getLongLongFromObjectOrReply(c,current,&value,
7147 "hash value is not an integer") != REDIS_OK) {
7148 decrRefCount(current);
7149 return;
7150 }
a3f3af86 7151 decrRefCount(current);
7fb16bac
PN
7152 } else {
7153 value = 0;
01426b05
PN
7154 }
7155
7fb16bac 7156 value += incr;
3f973463 7157 new = createStringObjectFromLongLong(value);
d1578a33
PN
7158 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7159 hashTypeSet(o,c->argv[2],new);
7fb16bac
PN
7160 decrRefCount(new);
7161 addReplyLongLong(c,value);
01426b05 7162 server.dirty++;
01426b05
PN
7163}
7164
978c2c94 7165static void hgetCommand(redisClient *c) {
7fb16bac 7166 robj *o, *value;
dd88747b 7167 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7168 checkType(c,o,REDIS_HASH)) return;
7169
d1578a33 7170 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7fb16bac 7171 addReplyBulk(c,value);
a3f3af86 7172 decrRefCount(value);
dd88747b 7173 } else {
7fb16bac 7174 addReply(c,shared.nullbulk);
69d95c3e 7175 }
69d95c3e
PN
7176}
7177
09aeb579
PN
7178static void hmgetCommand(redisClient *c) {
7179 int i;
7fb16bac
PN
7180 robj *o, *value;
7181 o = lookupKeyRead(c->db,c->argv[1]);
7182 if (o != NULL && o->type != REDIS_HASH) {
7183 addReply(c,shared.wrongtypeerr);
09aeb579
PN
7184 }
7185
7fb16bac
PN
7186 /* Note the check for o != NULL happens inside the loop. This is
7187 * done because objects that cannot be found are considered to be
7188 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 7189 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac 7190 for (i = 2; i < c->argc; i++) {
d1578a33 7191 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7fb16bac 7192 addReplyBulk(c,value);
a3f3af86 7193 decrRefCount(value);
7fb16bac
PN
7194 } else {
7195 addReply(c,shared.nullbulk);
09aeb579
PN
7196 }
7197 }
7198}
7199
07efaf74 7200static void hdelCommand(redisClient *c) {
dd88747b 7201 robj *o;
dd88747b 7202 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7203 checkType(c,o,REDIS_HASH)) return;
07efaf74 7204
d1578a33
PN
7205 if (hashTypeDelete(o,c->argv[2])) {
7206 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7fb16bac
PN
7207 addReply(c,shared.cone);
7208 server.dirty++;
dd88747b 7209 } else {
7fb16bac 7210 addReply(c,shared.czero);
07efaf74 7211 }
7212}
7213
92b27fe9 7214static void hlenCommand(redisClient *c) {
7215 robj *o;
dd88747b 7216 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 7217 checkType(c,o,REDIS_HASH)) return;
7218
d1578a33 7219 addReplyUlong(c,hashTypeLength(o));
92b27fe9 7220}
7221
78409a0f 7222static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 7223 robj *o, *lenobj, *obj;
78409a0f 7224 unsigned long count = 0;
d1578a33 7225 hashTypeIterator *hi;
78409a0f 7226
4e27f268 7227 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 7228 || checkType(c,o,REDIS_HASH)) return;
7229
7230 lenobj = createObject(REDIS_STRING,NULL);
7231 addReply(c,lenobj);
7232 decrRefCount(lenobj);
7233
d1578a33
PN
7234 hi = hashTypeInitIterator(o);
7235 while (hashTypeNext(hi) != REDIS_ERR) {
7fb16bac 7236 if (flags & REDIS_HASH_KEY) {
d1578a33 7237 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7fb16bac 7238 addReplyBulk(c,obj);
a3f3af86 7239 decrRefCount(obj);
7fb16bac 7240 count++;
78409a0f 7241 }
7fb16bac 7242 if (flags & REDIS_HASH_VALUE) {
d1578a33 7243 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 7244 addReplyBulk(c,obj);
a3f3af86 7245 decrRefCount(obj);
7fb16bac 7246 count++;
78409a0f 7247 }
78409a0f 7248 }
d1578a33 7249 hashTypeReleaseIterator(hi);
7fb16bac 7250
78409a0f 7251 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7252}
7253
7254static void hkeysCommand(redisClient *c) {
7fb16bac 7255 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 7256}
7257
7258static void hvalsCommand(redisClient *c) {
7fb16bac 7259 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 7260}
7261
7262static void hgetallCommand(redisClient *c) {
7fb16bac 7263 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 7264}
7265
a86f14b1 7266static void hexistsCommand(redisClient *c) {
7267 robj *o;
a86f14b1 7268 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7269 checkType(c,o,REDIS_HASH)) return;
7270
d1578a33 7271 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 7272}
7273
ada386b2 7274static void convertToRealHash(robj *o) {
7275 unsigned char *key, *val, *p, *zm = o->ptr;
7276 unsigned int klen, vlen;
7277 dict *dict = dictCreate(&hashDictType,NULL);
7278
7279 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7280 p = zipmapRewind(zm);
7281 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7282 robj *keyobj, *valobj;
7283
7284 keyobj = createStringObject((char*)key,klen);
7285 valobj = createStringObject((char*)val,vlen);
05df7621 7286 keyobj = tryObjectEncoding(keyobj);
7287 valobj = tryObjectEncoding(valobj);
ada386b2 7288 dictAdd(dict,keyobj,valobj);
7289 }
7290 o->encoding = REDIS_ENCODING_HT;
7291 o->ptr = dict;
7292 zfree(zm);
7293}
7294
6b47e12e 7295/* ========================= Non type-specific commands ==================== */
7296
ed9b544e 7297static void flushdbCommand(redisClient *c) {
ca37e9cd 7298 server.dirty += dictSize(c->db->dict);
9b30e1a2 7299 touchWatchedKeysOnFlush(c->db->id);
3305306f 7300 dictEmpty(c->db->dict);
7301 dictEmpty(c->db->expires);
ed9b544e 7302 addReply(c,shared.ok);
ed9b544e 7303}
7304
7305static void flushallCommand(redisClient *c) {
9b30e1a2 7306 touchWatchedKeysOnFlush(-1);
ca37e9cd 7307 server.dirty += emptyDb();
ed9b544e 7308 addReply(c,shared.ok);
500ece7c 7309 if (server.bgsavechildpid != -1) {
7310 kill(server.bgsavechildpid,SIGKILL);
7311 rdbRemoveTempFile(server.bgsavechildpid);
7312 }
f78fd11b 7313 rdbSave(server.dbfilename);
ca37e9cd 7314 server.dirty++;
ed9b544e 7315}
7316
56906eef 7317static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 7318 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 7319 so->type = type;
7320 so->pattern = pattern;
7321 return so;
7322}
7323
7324/* Return the value associated to the key with a name obtained
55017f9d
PN
7325 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7326 * The returned object will always have its refcount increased by 1
7327 * when it is non-NULL. */
56906eef 7328static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 7329 char *p, *f;
ed9b544e 7330 sds spat, ssub;
6d7d1370
PN
7331 robj keyobj, fieldobj, *o;
7332 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 7333 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7334 struct {
f1017b3f 7335 long len;
7336 long free;
ed9b544e 7337 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 7338 } keyname, fieldname;
ed9b544e 7339
28173a49 7340 /* If the pattern is "#" return the substitution object itself in order
7341 * to implement the "SORT ... GET #" feature. */
7342 spat = pattern->ptr;
7343 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 7344 incrRefCount(subst);
28173a49 7345 return subst;
7346 }
7347
7348 /* The substitution object may be specially encoded. If so we create
9d65a1bb 7349 * a decoded object on the fly. Otherwise getDecodedObject will just
7350 * increment the ref count, that we'll decrement later. */
7351 subst = getDecodedObject(subst);
942a3961 7352
ed9b544e 7353 ssub = subst->ptr;
7354 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7355 p = strchr(spat,'*');
ed5a857a 7356 if (!p) {
7357 decrRefCount(subst);
7358 return NULL;
7359 }
ed9b544e 7360
6d7d1370
PN
7361 /* Find out if we're dealing with a hash dereference. */
7362 if ((f = strstr(p+1, "->")) != NULL) {
7363 fieldlen = sdslen(spat)-(f-spat);
7364 /* this also copies \0 character */
7365 memcpy(fieldname.buf,f+2,fieldlen-1);
7366 fieldname.len = fieldlen-2;
7367 } else {
7368 fieldlen = 0;
7369 }
7370
ed9b544e 7371 prefixlen = p-spat;
7372 sublen = sdslen(ssub);
6d7d1370 7373 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 7374 memcpy(keyname.buf,spat,prefixlen);
7375 memcpy(keyname.buf+prefixlen,ssub,sublen);
7376 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7377 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7378 keyname.len = prefixlen+sublen+postfixlen;
942a3961 7379 decrRefCount(subst);
7380
6d7d1370
PN
7381 /* Lookup substituted key */
7382 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7383 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
7384 if (o == NULL) return NULL;
7385
7386 if (fieldlen > 0) {
7387 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 7388
705dad38
PN
7389 /* Retrieve value from hash by the field name. This operation
7390 * already increases the refcount of the returned object. */
6d7d1370 7391 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
d1578a33 7392 o = hashTypeGet(o, &fieldobj);
705dad38 7393 } else {
55017f9d 7394 if (o->type != REDIS_STRING) return NULL;
b6f07345 7395
705dad38
PN
7396 /* Every object that this function returns needs to have its refcount
7397 * increased. sortCommand decreases it again. */
7398 incrRefCount(o);
6d7d1370
PN
7399 }
7400
7401 return o;
ed9b544e 7402}
7403
7404/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7405 * the additional parameter is not standard but a BSD-specific we have to
7406 * pass sorting parameters via the global 'server' structure */
7407static int sortCompare(const void *s1, const void *s2) {
7408 const redisSortObject *so1 = s1, *so2 = s2;
7409 int cmp;
7410
7411 if (!server.sort_alpha) {
7412 /* Numeric sorting. Here it's trivial as we precomputed scores */
7413 if (so1->u.score > so2->u.score) {
7414 cmp = 1;
7415 } else if (so1->u.score < so2->u.score) {
7416 cmp = -1;
7417 } else {
7418 cmp = 0;
7419 }
7420 } else {
7421 /* Alphanumeric sorting */
7422 if (server.sort_bypattern) {
7423 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7424 /* At least one compare object is NULL */
7425 if (so1->u.cmpobj == so2->u.cmpobj)
7426 cmp = 0;
7427 else if (so1->u.cmpobj == NULL)
7428 cmp = -1;
7429 else
7430 cmp = 1;
7431 } else {
7432 /* We have both the objects, use strcoll */
7433 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7434 }
7435 } else {
08ee9b57 7436 /* Compare elements directly. */
7437 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 7438 }
7439 }
7440 return server.sort_desc ? -cmp : cmp;
7441}
7442
7443/* The SORT command is the most complex command in Redis. Warning: this code
7444 * is optimized for speed and a bit less for readability */
7445static void sortCommand(redisClient *c) {
ed9b544e 7446 list *operations;
a03611e1 7447 unsigned int outputlen = 0;
ed9b544e 7448 int desc = 0, alpha = 0;
7449 int limit_start = 0, limit_count = -1, start, end;
7450 int j, dontsort = 0, vectorlen;
7451 int getop = 0; /* GET operation counter */
443c6409 7452 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7453 redisSortObject *vector; /* Resulting vector to sort */
7454
7455 /* Lookup the key to sort. It must be of the right types */
3305306f 7456 sortval = lookupKeyRead(c->db,c->argv[1]);
7457 if (sortval == NULL) {
4e27f268 7458 addReply(c,shared.emptymultibulk);
ed9b544e 7459 return;
7460 }
a5eb649b 7461 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7462 sortval->type != REDIS_ZSET)
7463 {
c937aa89 7464 addReply(c,shared.wrongtypeerr);
ed9b544e 7465 return;
7466 }
7467
7468 /* Create a list of operations to perform for every sorted element.
7469 * Operations can be GET/DEL/INCR/DECR */
7470 operations = listCreate();
092dac2a 7471 listSetFreeMethod(operations,zfree);
ed9b544e 7472 j = 2;
7473
7474 /* Now we need to protect sortval incrementing its count, in the future
7475 * SORT may have options able to overwrite/delete keys during the sorting
7476 * and the sorted key itself may get destroied */
7477 incrRefCount(sortval);
7478
7479 /* The SORT command has an SQL-alike syntax, parse it */
7480 while(j < c->argc) {
7481 int leftargs = c->argc-j-1;
7482 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7483 desc = 0;
7484 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7485 desc = 1;
7486 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7487 alpha = 1;
7488 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7489 limit_start = atoi(c->argv[j+1]->ptr);
7490 limit_count = atoi(c->argv[j+2]->ptr);
7491 j+=2;
443c6409 7492 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7493 storekey = c->argv[j+1];
7494 j++;
ed9b544e 7495 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7496 sortby = c->argv[j+1];
7497 /* If the BY pattern does not contain '*', i.e. it is constant,
7498 * we don't need to sort nor to lookup the weight keys. */
7499 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7500 j++;
7501 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7502 listAddNodeTail(operations,createSortOperation(
7503 REDIS_SORT_GET,c->argv[j+1]));
7504 getop++;
7505 j++;
ed9b544e 7506 } else {
7507 decrRefCount(sortval);
7508 listRelease(operations);
c937aa89 7509 addReply(c,shared.syntaxerr);
ed9b544e 7510 return;
7511 }
7512 j++;
7513 }
7514
7515 /* Load the sorting vector with all the objects to sort */
a5eb649b 7516 switch(sortval->type) {
003f0840 7517 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
a5eb649b 7518 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7519 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7520 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7521 }
ed9b544e 7522 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7523 j = 0;
a5eb649b 7524
ed9b544e 7525 if (sortval->type == REDIS_LIST) {
003f0840
PN
7526 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7527 listTypeEntry entry;
7528 while(listTypeNext(li,&entry)) {
7529 vector[j].obj = listTypeGet(&entry);
ed9b544e 7530 vector[j].u.score = 0;
7531 vector[j].u.cmpobj = NULL;
ed9b544e 7532 j++;
7533 }
003f0840 7534 listTypeReleaseIterator(li);
ed9b544e 7535 } else {
a5eb649b 7536 dict *set;
ed9b544e 7537 dictIterator *di;
7538 dictEntry *setele;
7539
a5eb649b 7540 if (sortval->type == REDIS_SET) {
7541 set = sortval->ptr;
7542 } else {
7543 zset *zs = sortval->ptr;
7544 set = zs->dict;
7545 }
7546
ed9b544e 7547 di = dictGetIterator(set);
ed9b544e 7548 while((setele = dictNext(di)) != NULL) {
7549 vector[j].obj = dictGetEntryKey(setele);
7550 vector[j].u.score = 0;
7551 vector[j].u.cmpobj = NULL;
7552 j++;
7553 }
7554 dictReleaseIterator(di);
7555 }
dfc5e96c 7556 redisAssert(j == vectorlen);
ed9b544e 7557
7558 /* Now it's time to load the right scores in the sorting vector */
7559 if (dontsort == 0) {
7560 for (j = 0; j < vectorlen; j++) {
6d7d1370 7561 robj *byval;
ed9b544e 7562 if (sortby) {
6d7d1370 7563 /* lookup value to sort by */
3305306f 7564 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7565 if (!byval) continue;
ed9b544e 7566 } else {
6d7d1370
PN
7567 /* use object itself to sort by */
7568 byval = vector[j].obj;
7569 }
7570
7571 if (alpha) {
08ee9b57 7572 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7573 } else {
7574 if (byval->encoding == REDIS_ENCODING_RAW) {
7575 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7576 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7577 /* Don't need to decode the object if it's
7578 * integer-encoded (the only encoding supported) so
7579 * far. We can just cast it */
16fa22f1
PN
7580 vector[j].u.score = (long)byval->ptr;
7581 } else {
7582 redisAssert(1 != 1);
942a3961 7583 }
ed9b544e 7584 }
6d7d1370 7585
705dad38
PN
7586 /* when the object was retrieved using lookupKeyByPattern,
7587 * its refcount needs to be decreased. */
7588 if (sortby) {
7589 decrRefCount(byval);
ed9b544e 7590 }
7591 }
7592 }
7593
7594 /* We are ready to sort the vector... perform a bit of sanity check
7595 * on the LIMIT option too. We'll use a partial version of quicksort. */
7596 start = (limit_start < 0) ? 0 : limit_start;
7597 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7598 if (start >= vectorlen) {
7599 start = vectorlen-1;
7600 end = vectorlen-2;
7601 }
7602 if (end >= vectorlen) end = vectorlen-1;
7603
7604 if (dontsort == 0) {
7605 server.sort_desc = desc;
7606 server.sort_alpha = alpha;
7607 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7608 if (sortby && (start != 0 || end != vectorlen-1))
7609 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7610 else
7611 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7612 }
7613
7614 /* Send command output to the output buffer, performing the specified
7615 * GET/DEL/INCR/DECR operations if any. */
7616 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7617 if (storekey == NULL) {
7618 /* STORE option not specified, sent the sorting result to client */
7619 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7620 for (j = start; j <= end; j++) {
7621 listNode *ln;
c7df85a4 7622 listIter li;
7623
dd88747b 7624 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7625 listRewind(operations,&li);
7626 while((ln = listNext(&li))) {
443c6409 7627 redisSortOperation *sop = ln->value;
7628 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7629 vector[j].obj);
7630
7631 if (sop->type == REDIS_SORT_GET) {
55017f9d 7632 if (!val) {
443c6409 7633 addReply(c,shared.nullbulk);
7634 } else {
dd88747b 7635 addReplyBulk(c,val);
55017f9d 7636 decrRefCount(val);
443c6409 7637 }
7638 } else {
dfc5e96c 7639 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7640 }
7641 }
ed9b544e 7642 }
443c6409 7643 } else {
74e0f445 7644 robj *sobj = createZiplistObject();
443c6409 7645
7646 /* STORE option specified, set the sorting result as a List object */
7647 for (j = start; j <= end; j++) {
7648 listNode *ln;
c7df85a4 7649 listIter li;
7650
443c6409 7651 if (!getop) {
003f0840 7652 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
a03611e1
PN
7653 } else {
7654 listRewind(operations,&li);
7655 while((ln = listNext(&li))) {
7656 redisSortOperation *sop = ln->value;
7657 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7658 vector[j].obj);
7659
7660 if (sop->type == REDIS_SORT_GET) {
7661 if (!val) val = createStringObject("",0);
7662
003f0840 7663 /* listTypePush does an incrRefCount, so we should take care
a03611e1
PN
7664 * care of the incremented refcount caused by either
7665 * lookupKeyByPattern or createStringObject("",0) */
003f0840 7666 listTypePush(sobj,val,REDIS_TAIL);
a03611e1 7667 decrRefCount(val);
443c6409 7668 } else {
a03611e1
PN
7669 /* always fails */
7670 redisAssert(sop->type == REDIS_SORT_GET);
443c6409 7671 }
ed9b544e 7672 }
ed9b544e 7673 }
ed9b544e 7674 }
846d8b3e 7675 dbReplace(c->db,storekey,sobj);
443c6409 7676 /* Note: we add 1 because the DB is dirty anyway since even if the
7677 * SORT result is empty a new key is set and maybe the old content
7678 * replaced. */
7679 server.dirty += 1+outputlen;
7680 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7681 }
7682
7683 /* Cleanup */
a03611e1
PN
7684 if (sortval->type == REDIS_LIST)
7685 for (j = 0; j < vectorlen; j++)
7686 decrRefCount(vector[j].obj);
ed9b544e 7687 decrRefCount(sortval);
7688 listRelease(operations);
7689 for (j = 0; j < vectorlen; j++) {
16fa22f1 7690 if (alpha && vector[j].u.cmpobj)
ed9b544e 7691 decrRefCount(vector[j].u.cmpobj);
7692 }
7693 zfree(vector);
7694}
7695
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' receives the NUL-terminated result and must be large enough for it
 * (the longest output, e.g. "16384.00P", needs 10 bytes). Values below 1024
 * are printed as an integer with a 'B' suffix; larger values are scaled by
 * powers of 1024 and printed with two decimals and a K, M, G, T or P suffix.
 * Every value of 'n' produces output, so 's' is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else {
        /* Everything from 1 PiB up; an unsigned long long tops out below
         * 16384 PiB so the output stays short. */
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    }
}
7716
1c85b79f 7717/* Create the string returned by the INFO command. This is decoupled
7718 * by the INFO command itself as we need to report the same information
7719 * on memory corruption problems. */
7720static sds genRedisInfoString(void) {
ed9b544e 7721 sds info;
7722 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7723 int j;
ec6c7a1d 7724 char hmem[64];
55a8298f 7725
b72f6a4b 7726 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7727 info = sdscatprintf(sdsempty(),
7728 "redis_version:%s\r\n"
5436146c
PN
7729 "redis_git_sha1:%s\r\n"
7730 "redis_git_dirty:%d\r\n"
f1017b3f 7731 "arch_bits:%s\r\n"
7a932b74 7732 "multiplexing_api:%s\r\n"
0d7170a4 7733 "process_id:%ld\r\n"
682ac724 7734 "uptime_in_seconds:%ld\r\n"
7735 "uptime_in_days:%ld\r\n"
ed9b544e 7736 "connected_clients:%d\r\n"
7737 "connected_slaves:%d\r\n"
f86a74e9 7738 "blocked_clients:%d\r\n"
5fba9f71 7739 "used_memory:%zu\r\n"
ec6c7a1d 7740 "used_memory_human:%s\r\n"
ed9b544e 7741 "changes_since_last_save:%lld\r\n"
be2bb6b0 7742 "bgsave_in_progress:%d\r\n"
682ac724 7743 "last_save_time:%ld\r\n"
b3fad521 7744 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7745 "total_connections_received:%lld\r\n"
7746 "total_commands_processed:%lld\r\n"
2a6a2ed1 7747 "expired_keys:%lld\r\n"
3be2c9d7 7748 "hash_max_zipmap_entries:%zu\r\n"
7749 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7750 "pubsub_channels:%ld\r\n"
7751 "pubsub_patterns:%u\r\n"
7d98e08c 7752 "vm_enabled:%d\r\n"
a0f643ea 7753 "role:%s\r\n"
ed9b544e 7754 ,REDIS_VERSION,
5436146c 7755 REDIS_GIT_SHA1,
274e45e3 7756 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7757 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7758 aeGetApiName(),
0d7170a4 7759 (long) getpid(),
a0f643ea 7760 uptime,
7761 uptime/(3600*24),
ed9b544e 7762 listLength(server.clients)-listLength(server.slaves),
7763 listLength(server.slaves),
d5d55fc3 7764 server.blpop_blocked_clients,
b72f6a4b 7765 zmalloc_used_memory(),
ec6c7a1d 7766 hmem,
ed9b544e 7767 server.dirty,
9d65a1bb 7768 server.bgsavechildpid != -1,
ed9b544e 7769 server.lastsave,
b3fad521 7770 server.bgrewritechildpid != -1,
ed9b544e 7771 server.stat_numconnections,
7772 server.stat_numcommands,
2a6a2ed1 7773 server.stat_expiredkeys,
55a8298f 7774 server.hash_max_zipmap_entries,
7775 server.hash_max_zipmap_value,
ffc6b7f8 7776 dictSize(server.pubsub_channels),
7777 listLength(server.pubsub_patterns),
7d98e08c 7778 server.vm_enabled != 0,
a0f643ea 7779 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7780 );
a0f643ea 7781 if (server.masterhost) {
7782 info = sdscatprintf(info,
7783 "master_host:%s\r\n"
7784 "master_port:%d\r\n"
7785 "master_link_status:%s\r\n"
7786 "master_last_io_seconds_ago:%d\r\n"
7787 ,server.masterhost,
7788 server.masterport,
7789 (server.replstate == REDIS_REPL_CONNECTED) ?
7790 "up" : "down",
f72b934d 7791 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7792 );
7793 }
7d98e08c 7794 if (server.vm_enabled) {
1064ef87 7795 lockThreadedIO();
7d98e08c 7796 info = sdscatprintf(info,
7797 "vm_conf_max_memory:%llu\r\n"
7798 "vm_conf_page_size:%llu\r\n"
7799 "vm_conf_pages:%llu\r\n"
7800 "vm_stats_used_pages:%llu\r\n"
7801 "vm_stats_swapped_objects:%llu\r\n"
7802 "vm_stats_swappin_count:%llu\r\n"
7803 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7804 "vm_stats_io_newjobs_len:%lu\r\n"
7805 "vm_stats_io_processing_len:%lu\r\n"
7806 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7807 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7808 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7809 ,(unsigned long long) server.vm_max_memory,
7810 (unsigned long long) server.vm_page_size,
7811 (unsigned long long) server.vm_pages,
7812 (unsigned long long) server.vm_stats_used_pages,
7813 (unsigned long long) server.vm_stats_swapped_objects,
7814 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7815 (unsigned long long) server.vm_stats_swapouts,
7816 (unsigned long) listLength(server.io_newjobs),
7817 (unsigned long) listLength(server.io_processing),
7818 (unsigned long) listLength(server.io_processed),
d5d55fc3 7819 (unsigned long) server.io_active_threads,
7820 (unsigned long) server.vm_blocked_clients
7d98e08c 7821 );
1064ef87 7822 unlockThreadedIO();
7d98e08c 7823 }
c3cb078d 7824 for (j = 0; j < server.dbnum; j++) {
7825 long long keys, vkeys;
7826
7827 keys = dictSize(server.db[j].dict);
7828 vkeys = dictSize(server.db[j].expires);
7829 if (keys || vkeys) {
9d65a1bb 7830 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7831 j, keys, vkeys);
7832 }
7833 }
1c85b79f 7834 return info;
7835}
7836
7837static void infoCommand(redisClient *c) {
7838 sds info = genRedisInfoString();
83c6a618 7839 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7840 (unsigned long)sdslen(info)));
ed9b544e 7841 addReplySds(c,info);
70003d28 7842 addReply(c,shared.crlf);
ed9b544e 7843}
7844
3305306f 7845static void monitorCommand(redisClient *c) {
7846 /* ignore MONITOR if aleady slave or in monitor mode */
7847 if (c->flags & REDIS_SLAVE) return;
7848
7849 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7850 c->slaveseldb = 0;
6b47e12e 7851 listAddNodeTail(server.monitors,c);
3305306f 7852 addReply(c,shared.ok);
7853}
7854
7855/* ================================= Expire ================================= */
7856static int removeExpire(redisDb *db, robj *key) {
829137b9
PN
7857 /* An expire may only be removed if there is a corresponding entry in the
7858 * main dict. Otherwise, the key will never be freed. */
7859 redisAssert(dictFind(db->dict,key->ptr) != NULL);
09241813 7860 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
3305306f 7861 return 1;
7862 } else {
7863 return 0;
7864 }
7865}
7866
7867static int setExpire(redisDb *db, robj *key, time_t when) {
829137b9
PN
7868 dictEntry *de;
7869
7870 /* Reuse the sds from the main dict in the expire dict */
7871 redisAssert((de = dictFind(db->dict,key->ptr)) != NULL);
7872 if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) {
3305306f 7873 return 0;
7874 } else {
3305306f 7875 return 1;
7876 }
7877}
7878
bb32ede5 7879/* Return the expire time of the specified key, or -1 if no expire
7880 * is associated with this key (i.e. the key is non volatile) */
7881static time_t getExpire(redisDb *db, robj *key) {
7882 dictEntry *de;
7883
7884 /* No expire? return ASAP */
7885 if (dictSize(db->expires) == 0 ||
09241813 7886 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
bb32ede5 7887
829137b9
PN
7888 /* The entry was found in the expire dict, this means it should also
7889 * be present in the main dict (safety check). */
7890 redisAssert(dictFind(db->dict,key->ptr) != NULL);
bb32ede5 7891 return (time_t) dictGetEntryVal(de);
7892}
7893
3305306f 7894static int expireIfNeeded(redisDb *db, robj *key) {
829137b9
PN
7895 time_t when = getExpire(db,key);
7896 if (when < 0) return 0;
3305306f 7897
829137b9 7898 /* Return when this key has not expired */
3305306f 7899 if (time(NULL) <= when) return 0;
7900
7901 /* Delete the key */
2a6a2ed1 7902 server.stat_expiredkeys++;
829137b9
PN
7903 server.dirty++;
7904 return dbDelete(db,key);
3305306f 7905}
7906
7907static int deleteIfVolatile(redisDb *db, robj *key) {
829137b9 7908 if (getExpire(db,key) < 0) return 0;
3305306f 7909
7910 /* Delete the key */
2a6a2ed1 7911 server.stat_expiredkeys++;
829137b9
PN
7912 server.dirty++;
7913 return dbDelete(db,key);
3305306f 7914}
7915
bbe025e0 7916static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7917 dictEntry *de;
bbe025e0
AM
7918 time_t seconds;
7919
bd79a6bd 7920 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7921
7922 seconds -= offset;
3305306f 7923
09241813 7924 de = dictFind(c->db->dict,key->ptr);
3305306f 7925 if (de == NULL) {
7926 addReply(c,shared.czero);
7927 return;
7928 }
d4dd6556 7929 if (seconds <= 0) {
09241813 7930 if (dbDelete(c->db,key)) server.dirty++;
43e5ccdf 7931 addReply(c, shared.cone);
3305306f 7932 return;
7933 } else {
7934 time_t when = time(NULL)+seconds;
802e8373 7935 if (setExpire(c->db,key,when)) {
3305306f 7936 addReply(c,shared.cone);
77423026 7937 server.dirty++;
7938 } else {
3305306f 7939 addReply(c,shared.czero);
77423026 7940 }
3305306f 7941 return;
7942 }
7943}
7944
802e8373 7945static void expireCommand(redisClient *c) {
bbe025e0 7946 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7947}
7948
7949static void expireatCommand(redisClient *c) {
bbe025e0 7950 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7951}
7952
fd88489a 7953static void ttlCommand(redisClient *c) {
7954 time_t expire;
7955 int ttl = -1;
7956
7957 expire = getExpire(c->db,c->argv[1]);
7958 if (expire != -1) {
7959 ttl = (int) (expire-time(NULL));
7960 if (ttl < 0) ttl = -1;
7961 }
7962 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7963}
7964
6e469882 7965/* ================================ MULTI/EXEC ============================== */
7966
7967/* Client state initialization for MULTI/EXEC */
7968static void initClientMultiState(redisClient *c) {
7969 c->mstate.commands = NULL;
7970 c->mstate.count = 0;
7971}
7972
7973/* Release all the resources associated with MULTI/EXEC state */
7974static void freeClientMultiState(redisClient *c) {
7975 int j;
7976
7977 for (j = 0; j < c->mstate.count; j++) {
7978 int i;
7979 multiCmd *mc = c->mstate.commands+j;
7980
7981 for (i = 0; i < mc->argc; i++)
7982 decrRefCount(mc->argv[i]);
7983 zfree(mc->argv);
7984 }
7985 zfree(c->mstate.commands);
7986}
7987
7988/* Add a new command into the MULTI commands queue */
7989static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7990 multiCmd *mc;
7991 int j;
7992
7993 c->mstate.commands = zrealloc(c->mstate.commands,
7994 sizeof(multiCmd)*(c->mstate.count+1));
7995 mc = c->mstate.commands+c->mstate.count;
7996 mc->cmd = cmd;
7997 mc->argc = c->argc;
7998 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7999 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8000 for (j = 0; j < c->argc; j++)
8001 incrRefCount(mc->argv[j]);
8002 c->mstate.count++;
8003}
8004
8005static void multiCommand(redisClient *c) {
6531c94d 8006 if (c->flags & REDIS_MULTI) {
8007 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8008 return;
8009 }
6e469882 8010 c->flags |= REDIS_MULTI;
36c548f0 8011 addReply(c,shared.ok);
6e469882 8012}
8013
18b6cb76
DJ
8014static void discardCommand(redisClient *c) {
8015 if (!(c->flags & REDIS_MULTI)) {
8016 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8017 return;
8018 }
8019
8020 freeClientMultiState(c);
8021 initClientMultiState(c);
8022 c->flags &= (~REDIS_MULTI);
a2645226 8023 unwatchAllKeys(c);
18b6cb76
DJ
8024 addReply(c,shared.ok);
8025}
8026
66c8853f 8027/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8028 * implememntation for more information. */
8029static void execCommandReplicateMulti(redisClient *c) {
8030 struct redisCommand *cmd;
8031 robj *multistring = createStringObject("MULTI",5);
8032
8033 cmd = lookupCommand("multi");
8034 if (server.appendonly)
8035 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8036 if (listLength(server.slaves))
8037 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8038 decrRefCount(multistring);
8039}
8040
6e469882 8041static void execCommand(redisClient *c) {
8042 int j;
8043 robj **orig_argv;
8044 int orig_argc;
8045
8046 if (!(c->flags & REDIS_MULTI)) {
8047 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8048 return;
8049 }
8050
37ab76c9 8051 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8052 * A failed EXEC will return a multi bulk nil object. */
8053 if (c->flags & REDIS_DIRTY_CAS) {
8054 freeClientMultiState(c);
8055 initClientMultiState(c);
8056 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8057 unwatchAllKeys(c);
8058 addReply(c,shared.nullmultibulk);
8059 return;
8060 }
8061
66c8853f 8062 /* Replicate a MULTI request now that we are sure the block is executed.
8063 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8064 * both the AOF and the replication link will have the same consistency
8065 * and atomicity guarantees. */
8066 execCommandReplicateMulti(c);
8067
8068 /* Exec all the queued commands */
1ad4d316 8069 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 8070 orig_argv = c->argv;
8071 orig_argc = c->argc;
8072 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8073 for (j = 0; j < c->mstate.count; j++) {
8074 c->argc = c->mstate.commands[j].argc;
8075 c->argv = c->mstate.commands[j].argv;
8076 call(c,c->mstate.commands[j].cmd);
8077 }
8078 c->argv = orig_argv;
8079 c->argc = orig_argc;
8080 freeClientMultiState(c);
8081 initClientMultiState(c);
1ad4d316 8082 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 8083 /* Make sure the EXEC command is always replicated / AOF, since we
8084 * always send the MULTI command (we can't know beforehand if the
8085 * next operations will contain at least a modification to the DB). */
8086 server.dirty++;
6e469882 8087}
8088
4409877e 8089/* =========================== Blocking Operations ========================= */
8090
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8119
8120/* Set a client in blocking mode for the specified key, with the specified
8121 * timeout */
b177fd30 8122static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 8123 dictEntry *de;
8124 list *l;
b177fd30 8125 int j;
4409877e 8126
37ab76c9 8127 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8128 c->blocking_keys_num = numkeys;
4409877e 8129 c->blockingto = timeout;
b177fd30 8130 for (j = 0; j < numkeys; j++) {
8131 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 8132 c->blocking_keys[j] = keys[j];
b177fd30 8133 incrRefCount(keys[j]);
4409877e 8134
b177fd30 8135 /* And in the other "side", to map keys -> clients */
37ab76c9 8136 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 8137 if (de == NULL) {
8138 int retval;
8139
8140 /* For every key we take a list of clients blocked for it */
8141 l = listCreate();
37ab76c9 8142 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 8143 incrRefCount(keys[j]);
8144 assert(retval == DICT_OK);
8145 } else {
8146 l = dictGetEntryVal(de);
8147 }
8148 listAddNodeTail(l,c);
4409877e 8149 }
b177fd30 8150 /* Mark the client as a blocked client */
4409877e 8151 c->flags |= REDIS_BLOCKED;
d5d55fc3 8152 server.blpop_blocked_clients++;
4409877e 8153}
8154
8155/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 8156static void unblockClientWaitingData(redisClient *c) {
4409877e 8157 dictEntry *de;
8158 list *l;
b177fd30 8159 int j;
4409877e 8160
37ab76c9 8161 assert(c->blocking_keys != NULL);
b177fd30 8162 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 8163 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 8164 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 8165 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 8166 assert(de != NULL);
8167 l = dictGetEntryVal(de);
8168 listDelNode(l,listSearchKey(l,c));
8169 /* If the list is empty we need to remove it to avoid wasting memory */
8170 if (listLength(l) == 0)
37ab76c9 8171 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8172 decrRefCount(c->blocking_keys[j]);
b177fd30 8173 }
8174 /* Cleanup the client structure */
37ab76c9 8175 zfree(c->blocking_keys);
8176 c->blocking_keys = NULL;
4409877e 8177 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 8178 server.blpop_blocked_clients--;
5921aa36 8179 /* We want to process data if there is some command waiting
b0d8747d 8180 * in the input buffer. Note that this is safe even if
8181 * unblockClientWaitingData() gets called from freeClient() because
8182 * freeClient() will be smart enough to call this function
8183 * *after* c->querybuf was set to NULL. */
4409877e 8184 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8185}
8186
8187/* This should be called from any function PUSHing into lists.
8188 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8189 * 'ele' is the element pushed.
8190 *
8191 * If the function returns 0 there was no client waiting for a list push
8192 * against this key.
8193 *
8194 * If the function returns 1 there was a client waiting for a list push
8195 * against this key, the element was passed to this client thus it's not
8196 * needed to actually add it to the list and the caller should return asap. */
8197static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8198 struct dictEntry *de;
8199 redisClient *receiver;
8200 list *l;
8201 listNode *ln;
8202
37ab76c9 8203 de = dictFind(c->db->blocking_keys,key);
4409877e 8204 if (de == NULL) return 0;
8205 l = dictGetEntryVal(de);
8206 ln = listFirst(l);
8207 assert(ln != NULL);
8208 receiver = ln->value;
4409877e 8209
b177fd30 8210 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 8211 addReplyBulk(receiver,key);
8212 addReplyBulk(receiver,ele);
b0d8747d 8213 unblockClientWaitingData(receiver);
4409877e 8214 return 1;
8215}
8216
8217/* Blocking RPOP/LPOP */
8218static void blockingPopGenericCommand(redisClient *c, int where) {
8219 robj *o;
8220 time_t timeout;
b177fd30 8221 int j;
4409877e 8222
b177fd30 8223 for (j = 1; j < c->argc-1; j++) {
8224 o = lookupKeyWrite(c->db,c->argv[j]);
8225 if (o != NULL) {
8226 if (o->type != REDIS_LIST) {
8227 addReply(c,shared.wrongtypeerr);
4409877e 8228 return;
b177fd30 8229 } else {
8230 list *list = o->ptr;
8231 if (listLength(list) != 0) {
8232 /* If the list contains elements fall back to the usual
8233 * non-blocking POP operation */
8234 robj *argv[2], **orig_argv;
8235 int orig_argc;
e0a62c7f 8236
b177fd30 8237 /* We need to alter the command arguments before to call
8238 * popGenericCommand() as the command takes a single key. */
8239 orig_argv = c->argv;
8240 orig_argc = c->argc;
8241 argv[1] = c->argv[j];
8242 c->argv = argv;
8243 c->argc = 2;
8244
8245 /* Also the return value is different, we need to output
8246 * the multi bulk reply header and the key name. The
8247 * "real" command will add the last element (the value)
8248 * for us. If this souds like an hack to you it's just
8249 * because it is... */
8250 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 8251 addReplyBulk(c,argv[1]);
b177fd30 8252 popGenericCommand(c,where);
8253
8254 /* Fix the client structure with the original stuff */
8255 c->argv = orig_argv;
8256 c->argc = orig_argc;
8257 return;
8258 }
4409877e 8259 }
8260 }
8261 }
8262 /* If the list is empty or the key does not exists we must block */
b177fd30 8263 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 8264 if (timeout > 0) timeout += time(NULL);
b177fd30 8265 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 8266}
8267
8268static void blpopCommand(redisClient *c) {
8269 blockingPopGenericCommand(c,REDIS_HEAD);
8270}
8271
8272static void brpopCommand(redisClient *c) {
8273 blockingPopGenericCommand(c,REDIS_TAIL);
8274}
8275
ed9b544e 8276/* =============================== Replication ============================= */
8277
a4d1ba9a 8278static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8279 ssize_t nwritten, ret = size;
8280 time_t start = time(NULL);
8281
8282 timeout++;
8283 while(size) {
8284 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8285 nwritten = write(fd,ptr,size);
8286 if (nwritten == -1) return -1;
8287 ptr += nwritten;
8288 size -= nwritten;
8289 }
8290 if ((time(NULL)-start) > timeout) {
8291 errno = ETIMEDOUT;
8292 return -1;
8293 }
8294 }
8295 return ret;
8296}
8297
a4d1ba9a 8298static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8299 ssize_t nread, totread = 0;
8300 time_t start = time(NULL);
8301
8302 timeout++;
8303 while(size) {
8304 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8305 nread = read(fd,ptr,size);
8306 if (nread == -1) return -1;
8307 ptr += nread;
8308 size -= nread;
8309 totread += nread;
8310 }
8311 if ((time(NULL)-start) > timeout) {
8312 errno = ETIMEDOUT;
8313 return -1;
8314 }
8315 }
8316 return totread;
8317}
8318
/* Read a line from 'fd' into 'ptr', one byte at a time through syncRead()
 * ('timeout' applies to every single byte).
 *
 * 'size' is the size of the destination buffer, terminator included. The
 * trailing "\n" (and a preceding "\r", if any) is not stored and the result
 * is always null terminated. Reading stops at the newline or when size-1
 * bytes were stored, whichever comes first.
 *
 * Returns the number of bytes stored before the newline (a stripped "\r"
 * is still counted), or -1 on error. A 'size' that cannot even hold the
 * terminator is rejected with errno set to EINVAL. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* reserve room for the terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One slot consumed. The counter was never updated before, so a
         * line longer than the buffer was written past its end. */
        size--;
    }
    return nread;
}
8339
8340static void syncCommand(redisClient *c) {
40d224a9 8341 /* ignore SYNC if aleady slave or in monitor mode */
8342 if (c->flags & REDIS_SLAVE) return;
8343
8344 /* SYNC can't be issued when the server has pending data to send to
8345 * the client about already issued commands. We need a fresh reply
8346 * buffer registering the differences between the BGSAVE and the current
8347 * dataset, so that we can copy to other slaves if needed. */
8348 if (listLength(c->reply) != 0) {
8349 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8350 return;
8351 }
8352
8353 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8354 /* Here we need to check if there is a background saving operation
8355 * in progress, or if it is required to start one */
9d65a1bb 8356 if (server.bgsavechildpid != -1) {
40d224a9 8357 /* Ok a background save is in progress. Let's check if it is a good
8358 * one for replication, i.e. if there is another slave that is
8359 * registering differences since the server forked to save */
8360 redisClient *slave;
8361 listNode *ln;
c7df85a4 8362 listIter li;
40d224a9 8363
c7df85a4 8364 listRewind(server.slaves,&li);
8365 while((ln = listNext(&li))) {
40d224a9 8366 slave = ln->value;
8367 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 8368 }
8369 if (ln) {
8370 /* Perfect, the server is already registering differences for
8371 * another slave. Set the right state, and copy the buffer. */
8372 listRelease(c->reply);
8373 c->reply = listDup(slave->reply);
40d224a9 8374 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8375 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8376 } else {
8377 /* No way, we need to wait for the next BGSAVE in order to
8378 * register differences */
8379 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8380 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8381 }
8382 } else {
8383 /* Ok we don't have a BGSAVE in progress, let's start one */
8384 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8385 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8386 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8387 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8388 return;
8389 }
8390 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8391 }
6208b3a7 8392 c->repldbfd = -1;
40d224a9 8393 c->flags |= REDIS_SLAVE;
8394 c->slaveseldb = 0;
6b47e12e 8395 listAddNodeTail(server.slaves,c);
40d224a9 8396 return;
8397}
8398
6208b3a7 8399static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8400 redisClient *slave = privdata;
8401 REDIS_NOTUSED(el);
8402 REDIS_NOTUSED(mask);
8403 char buf[REDIS_IOBUF_LEN];
8404 ssize_t nwritten, buflen;
8405
8406 if (slave->repldboff == 0) {
8407 /* Write the bulk write count before to transfer the DB. In theory here
8408 * we don't know how much room there is in the output buffer of the
8409 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8410 * operations) will never be smaller than the few bytes we need. */
8411 sds bulkcount;
8412
8413 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8414 slave->repldbsize);
8415 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8416 {
8417 sdsfree(bulkcount);
8418 freeClient(slave);
8419 return;
8420 }
8421 sdsfree(bulkcount);
8422 }
8423 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8424 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8425 if (buflen <= 0) {
8426 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8427 (buflen == 0) ? "premature EOF" : strerror(errno));
8428 freeClient(slave);
8429 return;
8430 }
8431 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 8432 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 8433 strerror(errno));
8434 freeClient(slave);
8435 return;
8436 }
8437 slave->repldboff += nwritten;
8438 if (slave->repldboff == slave->repldbsize) {
8439 close(slave->repldbfd);
8440 slave->repldbfd = -1;
8441 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8442 slave->replstate = REDIS_REPL_ONLINE;
8443 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8444 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8445 freeClient(slave);
8446 return;
8447 }
8448 addReplySds(slave,sdsempty());
8449 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8450 }
8451}
ed9b544e 8452
a3b21203 8453/* This function is called at the end of every backgrond saving.
8454 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8455 * otherwise REDIS_ERR is passed to the function.
8456 *
8457 * The goal of this function is to handle slaves waiting for a successful
8458 * background saving in order to perform non-blocking synchronization. */
8459static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8460 listNode *ln;
8461 int startbgsave = 0;
c7df85a4 8462 listIter li;
ed9b544e 8463
c7df85a4 8464 listRewind(server.slaves,&li);
8465 while((ln = listNext(&li))) {
6208b3a7 8466 redisClient *slave = ln->value;
ed9b544e 8467
6208b3a7 8468 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8469 startbgsave = 1;
8470 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8471 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8472 struct redis_stat buf;
e0a62c7f 8473
6208b3a7 8474 if (bgsaveerr != REDIS_OK) {
8475 freeClient(slave);
8476 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8477 continue;
8478 }
8479 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8480 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8481 freeClient(slave);
8482 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8483 continue;
8484 }
8485 slave->repldboff = 0;
8486 slave->repldbsize = buf.st_size;
8487 slave->replstate = REDIS_REPL_SEND_BULK;
8488 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8489 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8490 freeClient(slave);
8491 continue;
8492 }
8493 }
ed9b544e 8494 }
6208b3a7 8495 if (startbgsave) {
8496 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8497 listIter li;
8498
8499 listRewind(server.slaves,&li);
6208b3a7 8500 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8501 while((ln = listNext(&li))) {
6208b3a7 8502 redisClient *slave = ln->value;
ed9b544e 8503
6208b3a7 8504 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8505 freeClient(slave);
8506 }
8507 }
8508 }
ed9b544e 8509}
8510
8511static int syncWithMaster(void) {
d0ccebcf 8512 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8513 long dumpsize;
ed9b544e 8514 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8515 int dfd, maxtries = 5;
ed9b544e 8516
8517 if (fd == -1) {
8518 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8519 strerror(errno));
8520 return REDIS_ERR;
8521 }
d0ccebcf 8522
8523 /* AUTH with the master if required. */
8524 if(server.masterauth) {
8525 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8526 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8527 close(fd);
8528 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8529 strerror(errno));
8530 return REDIS_ERR;
8531 }
8532 /* Read the AUTH result. */
8533 if (syncReadLine(fd,buf,1024,3600) == -1) {
8534 close(fd);
8535 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8536 strerror(errno));
8537 return REDIS_ERR;
8538 }
8539 if (buf[0] != '+') {
8540 close(fd);
8541 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8542 return REDIS_ERR;
8543 }
8544 }
8545
ed9b544e 8546 /* Issue the SYNC command */
8547 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8548 close(fd);
8549 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8550 strerror(errno));
8551 return REDIS_ERR;
8552 }
8553 /* Read the bulk write count */
8c4d91fc 8554 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8555 close(fd);
8556 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8557 strerror(errno));
8558 return REDIS_ERR;
8559 }
4aa701c1 8560 if (buf[0] != '$') {
8561 close(fd);
8562 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8563 return REDIS_ERR;
8564 }
18e61fa2 8565 dumpsize = strtol(buf+1,NULL,10);
8566 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8567 /* Read the bulk write data on a temp file */
8c5abee8 8568 while(maxtries--) {
8569 snprintf(tmpfile,256,
8570 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8571 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8572 if (dfd != -1) break;
5de9ad7c 8573 sleep(1);
8c5abee8 8574 }
ed9b544e 8575 if (dfd == -1) {
8576 close(fd);
8577 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8578 return REDIS_ERR;
8579 }
8580 while(dumpsize) {
8581 int nread, nwritten;
8582
8583 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8584 if (nread == -1) {
8585 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8586 strerror(errno));
8587 close(fd);
8588 close(dfd);
8589 return REDIS_ERR;
8590 }
8591 nwritten = write(dfd,buf,nread);
8592 if (nwritten == -1) {
8593 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8594 close(fd);
8595 close(dfd);
8596 return REDIS_ERR;
8597 }
8598 dumpsize -= nread;
8599 }
8600 close(dfd);
8601 if (rename(tmpfile,server.dbfilename) == -1) {
8602 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8603 unlink(tmpfile);
8604 close(fd);
8605 return REDIS_ERR;
8606 }
8607 emptyDb();
f78fd11b 8608 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8609 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8610 close(fd);
8611 return REDIS_ERR;
8612 }
8613 server.master = createClient(fd);
8614 server.master->flags |= REDIS_MASTER;
179b3952 8615 server.master->authenticated = 1;
ed9b544e 8616 server.replstate = REDIS_REPL_CONNECTED;
8617 return REDIS_OK;
8618}
8619
321b0e13 8620static void slaveofCommand(redisClient *c) {
8621 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8622 !strcasecmp(c->argv[2]->ptr,"one")) {
8623 if (server.masterhost) {
8624 sdsfree(server.masterhost);
8625 server.masterhost = NULL;
8626 if (server.master) freeClient(server.master);
8627 server.replstate = REDIS_REPL_NONE;
8628 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8629 }
8630 } else {
8631 sdsfree(server.masterhost);
8632 server.masterhost = sdsdup(c->argv[1]->ptr);
8633 server.masterport = atoi(c->argv[2]->ptr);
8634 if (server.master) freeClient(server.master);
8635 server.replstate = REDIS_REPL_CONNECT;
8636 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8637 server.masterhost, server.masterport);
8638 }
8639 addReply(c,shared.ok);
8640}
8641
3fd78bcd 8642/* ============================ Maxmemory directive ======================== */
8643
a5819310 8644/* Try to free one object form the pre-allocated objects free list.
8645 * This is useful under low mem conditions as by default we take 1 million
8646 * free objects allocated. On success REDIS_OK is returned, otherwise
8647 * REDIS_ERR. */
8648static int tryFreeOneObjectFromFreelist(void) {
f870935d 8649 robj *o;
8650
a5819310 8651 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8652 if (listLength(server.objfreelist)) {
8653 listNode *head = listFirst(server.objfreelist);
8654 o = listNodeValue(head);
8655 listDelNode(server.objfreelist,head);
8656 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8657 zfree(o);
8658 return REDIS_OK;
8659 } else {
8660 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8661 return REDIS_ERR;
8662 }
f870935d 8663}
8664
3fd78bcd 8665/* This function gets called when 'maxmemory' is set on the config file to limit
8666 * the max memory used by the server, and we are out of memory.
8667 * This function will try to, in order:
8668 *
8669 * - Free objects from the free list
8670 * - Try to remove keys with an EXPIRE set
8671 *
8672 * It is not possible to free enough memory to reach used-memory < maxmemory
8673 * the server will start refusing commands that will enlarge even more the
8674 * memory usage.
8675 */
8676static void freeMemoryIfNeeded(void) {
8677 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8678 int j, k, freed = 0;
8679
8680 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8681 for (j = 0; j < server.dbnum; j++) {
8682 int minttl = -1;
8683 robj *minkey = NULL;
8684 struct dictEntry *de;
8685
8686 if (dictSize(server.db[j].expires)) {
8687 freed = 1;
8688 /* From a sample of three keys drop the one nearest to
8689 * the natural expire */
8690 for (k = 0; k < 3; k++) {
8691 time_t t;
8692
8693 de = dictGetRandomKey(server.db[j].expires);
8694 t = (time_t) dictGetEntryVal(de);
8695 if (minttl == -1 || t < minttl) {
8696 minkey = dictGetEntryKey(de);
8697 minttl = t;
3fd78bcd 8698 }
3fd78bcd 8699 }
09241813 8700 dbDelete(server.db+j,minkey);
3fd78bcd 8701 }
3fd78bcd 8702 }
a5819310 8703 if (!freed) return; /* nothing to free... */
3fd78bcd 8704 }
8705}
8706
f80dff62 8707/* ============================== Append Only file ========================== */
8708
560db612 8709/* Called when the user switches from "appendonly yes" to "appendonly no"
8710 * at runtime using the CONFIG command. */
8711static void stopAppendOnly(void) {
8712 flushAppendOnlyFile();
8713 aof_fsync(server.appendfd);
8714 close(server.appendfd);
8715
8716 server.appendfd = -1;
8717 server.appendseldb = -1;
8718 server.appendonly = 0;
8719 /* rewrite operation in progress? kill it, wait child exit */
8720 if (server.bgsavechildpid != -1) {
8721 int statloc;
8722
8723 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8724 wait3(&statloc,0,NULL);
8725 /* reset the buffer accumulating changes while the child saves */
8726 sdsfree(server.bgrewritebuf);
8727 server.bgrewritebuf = sdsempty();
8728 server.bgsavechildpid = -1;
8729 }
8730}
8731
8732/* Called when the user switches from "appendonly no" to "appendonly yes"
8733 * at runtime using the CONFIG command. */
8734static int startAppendOnly(void) {
8735 server.appendonly = 1;
8736 server.lastfsync = time(NULL);
8737 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8738 if (server.appendfd == -1) {
8739 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8740 return REDIS_ERR;
8741 }
8742 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8743 server.appendonly = 0;
8744 close(server.appendfd);
8745 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8746 return REDIS_ERR;
8747 }
8748 return REDIS_OK;
8749}
8750
28ed1f33 8751/* Write the append only file buffer on disk.
8752 *
8753 * Since we are required to write the AOF before replying to the client,
8754 * and the only way the client socket can get a write is entering when the
8755 * the event loop, we accumulate all the AOF writes in a memory
8756 * buffer and write it on disk using this function just before entering
8757 * the event loop again. */
8758static void flushAppendOnlyFile(void) {
8759 time_t now;
8760 ssize_t nwritten;
8761
8762 if (sdslen(server.aofbuf) == 0) return;
8763
8764 /* We want to perform a single write. This should be guaranteed atomic
8765 * at least if the filesystem we are writing is a real physical one.
8766 * While this will save us against the server being killed I don't think
8767 * there is much to do about the whole server stopping for power problems
8768 * or alike */
8769 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8770 if (nwritten != (signed)sdslen(server.aofbuf)) {
8771 /* Ooops, we are in troubles. The best thing to do for now is
8772 * aborting instead of giving the illusion that everything is
8773 * working as expected. */
8774 if (nwritten == -1) {
8775 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8776 } else {
8777 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8778 }
8779 exit(1);
8780 }
8781 sdsfree(server.aofbuf);
8782 server.aofbuf = sdsempty();
8783
38db9171 8784 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8785 * childs performing heavy I/O on disk. */
8786 if (server.no_appendfsync_on_rewrite &&
8787 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8788 return;
28ed1f33 8789 /* Fsync if needed */
8790 now = time(NULL);
8791 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8792 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8793 now-server.lastfsync > 1))
8794 {
8795 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8796 * flushing metadata. */
8797 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8798 server.lastfsync = now;
8799 }
8800}
8801
9376e434
PN
8802static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8803 int j;
8804 buf = sdscatprintf(buf,"*%d\r\n",argc);
8805 for (j = 0; j < argc; j++) {
8806 robj *o = getDecodedObject(argv[j]);
8807 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8808 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8809 buf = sdscatlen(buf,"\r\n",2);
8810 decrRefCount(o);
8811 }
8812 return buf;
8813}
8814
8815static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8816 int argc = 3;
8817 long when;
8818 robj *argv[3];
8819
8820 /* Make sure we can use strtol */
8821 seconds = getDecodedObject(seconds);
8822 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8823 decrRefCount(seconds);
8824
8825 argv[0] = createStringObject("EXPIREAT",8);
8826 argv[1] = key;
8827 argv[2] = createObject(REDIS_STRING,
8828 sdscatprintf(sdsempty(),"%ld",when));
8829 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8830 decrRefCount(argv[0]);
8831 decrRefCount(argv[2]);
8832 return buf;
8833}
8834
f80dff62 8835static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8836 sds buf = sdsempty();
f80dff62 8837 robj *tmpargv[3];
8838
8839 /* The DB this command was targetting is not the same as the last command
8840 * we appendend. To issue a SELECT command is needed. */
8841 if (dictid != server.appendseldb) {
8842 char seldb[64];
8843
8844 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8845 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8846 (unsigned long)strlen(seldb),seldb);
f80dff62 8847 server.appendseldb = dictid;
8848 }
8849
f80dff62 8850 if (cmd->proc == expireCommand) {
9376e434
PN
8851 /* Translate EXPIRE into EXPIREAT */
8852 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8853 } else if (cmd->proc == setexCommand) {
8854 /* Translate SETEX to SET and EXPIREAT */
8855 tmpargv[0] = createStringObject("SET",3);
f80dff62 8856 tmpargv[1] = argv[1];
9376e434
PN
8857 tmpargv[2] = argv[3];
8858 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8859 decrRefCount(tmpargv[0]);
8860 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8861 } else {
8862 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8863 }
8864
28ed1f33 8865 /* Append to the AOF buffer. This will be flushed on disk just before
8866 * of re-entering the event loop, so before the client will get a
8867 * positive reply about the operation performed. */
8868 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8869
85a83172 8870 /* If a background append only file rewriting is in progress we want to
8871 * accumulate the differences between the child DB and the current one
8872 * in a buffer, so that when the child process will do its work we
8873 * can append the differences to the new append only file. */
8874 if (server.bgrewritechildpid != -1)
8875 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8876
8877 sdsfree(buf);
f80dff62 8878}
8879
8880/* In Redis commands are always executed in the context of a client, so in
8881 * order to load the append only file we need to create a fake client. */
8882static struct redisClient *createFakeClient(void) {
8883 struct redisClient *c = zmalloc(sizeof(*c));
8884
8885 selectDb(c,0);
8886 c->fd = -1;
8887 c->querybuf = sdsempty();
8888 c->argc = 0;
8889 c->argv = NULL;
8890 c->flags = 0;
9387d17d 8891 /* We set the fake client as a slave waiting for the synchronization
8892 * so that Redis will not try to send replies to this client. */
8893 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8894 c->reply = listCreate();
8895 listSetFreeMethod(c->reply,decrRefCount);
8896 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8897 initClientMultiState(c);
f80dff62 8898 return c;
8899}
8900
8901static void freeFakeClient(struct redisClient *c) {
8902 sdsfree(c->querybuf);
8903 listRelease(c->reply);
4132ad8d 8904 freeClientMultiState(c);
f80dff62 8905 zfree(c);
8906}
8907
8908/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8909 * error (the append only file is zero-length) REDIS_ERR is returned. On
8910 * fatal error an error message is logged and the program exists. */
8911int loadAppendOnlyFile(char *filename) {
8912 struct redisClient *fakeClient;
8913 FILE *fp = fopen(filename,"r");
8914 struct redis_stat sb;
4132ad8d 8915 int appendonly = server.appendonly;
f80dff62 8916
8917 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8918 return REDIS_ERR;
8919
8920 if (fp == NULL) {
8921 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8922 exit(1);
8923 }
8924
4132ad8d
PN
8925 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8926 * to the same file we're about to read. */
8927 server.appendonly = 0;
8928
f80dff62 8929 fakeClient = createFakeClient();
8930 while(1) {
8931 int argc, j;
8932 unsigned long len;
8933 robj **argv;
8934 char buf[128];
8935 sds argsds;
8936 struct redisCommand *cmd;
a89b7013 8937 int force_swapout;
f80dff62 8938
8939 if (fgets(buf,sizeof(buf),fp) == NULL) {
8940 if (feof(fp))
8941 break;
8942 else
8943 goto readerr;
8944 }
8945 if (buf[0] != '*') goto fmterr;
8946 argc = atoi(buf+1);
8947 argv = zmalloc(sizeof(robj*)*argc);
8948 for (j = 0; j < argc; j++) {
8949 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8950 if (buf[0] != '$') goto fmterr;
8951 len = strtol(buf+1,NULL,10);
8952 argsds = sdsnewlen(NULL,len);
0f151ef1 8953 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8954 argv[j] = createObject(REDIS_STRING,argsds);
8955 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8956 }
8957
8958 /* Command lookup */
8959 cmd = lookupCommand(argv[0]->ptr);
8960 if (!cmd) {
8961 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8962 exit(1);
8963 }
bdcb92f2 8964 /* Try object encoding */
f80dff62 8965 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8966 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8967 /* Run the command in the context of a fake client */
8968 fakeClient->argc = argc;
8969 fakeClient->argv = argv;
8970 cmd->proc(fakeClient);
8971 /* Discard the reply objects list from the fake client */
8972 while(listLength(fakeClient->reply))
8973 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8974 /* Clean up, ready for the next command */
8975 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8976 zfree(argv);
b492cf00 8977 /* Handle swapping while loading big datasets when VM is on */
a89b7013 8978 force_swapout = 0;
8979 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8980 force_swapout = 1;
8981
8982 if (server.vm_enabled && force_swapout) {
b492cf00 8983 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8984 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8985 }
8986 }
f80dff62 8987 }
4132ad8d
PN
8988
8989 /* This point can only be reached when EOF is reached without errors.
8990 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8991 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8992
f80dff62 8993 fclose(fp);
8994 freeFakeClient(fakeClient);
4132ad8d 8995 server.appendonly = appendonly;
f80dff62 8996 return REDIS_OK;
8997
8998readerr:
8999 if (feof(fp)) {
9000 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9001 } else {
9002 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9003 }
9004 exit(1);
9005fmterr:
9006 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9007 exit(1);
9008}
9009
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format: $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char prefix[128];
    int plen = 0;

    /* Build the "$<count>\r\n" prefix by hand. */
    prefix[plen++] = '$';
    plen += ll2string(prefix+1,sizeof(prefix)-1,len);
    prefix[plen++] = '\r';
    prefix[plen++] = '\n';

    if (fwrite(prefix,plen,1,fp) == 0) return 0;
    /* An empty payload is valid: there is just nothing to write for it. */
    if (len > 0) {
        if (fwrite(s,len,1,fp) == 0) return 0;
    }
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
9024
/* Write the double 'd' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the value formatted
 * with "%.17g".
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    int plen, hlen;

    /* The payload is formatted together with its trailing CRLF, that
     * is not part of the advertised count. */
    plen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,hlen,1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
9035
/* Write the integer 'l' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if the write failed. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    /* Convert the number first: its length is needed for the prefix. */
    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,outlen,1,fp) == 0) ? 0 : 1;
}
9045
9eaef89f
PN
9046/* Delegate writing an object to writing a bulk string or bulk long long. */
9047static int fwriteBulkObject(FILE *fp, robj *obj) {
9048 /* Avoid using getDecodedObject to help copy-on-write (we are often
9049 * in a child process when this function is called). */
9050 if (obj->encoding == REDIS_ENCODING_INT) {
9051 return fwriteBulkLongLong(fp,(long)obj->ptr);
9052 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9053 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9054 } else {
9055 redisPanic("Unknown string encoding");
9056 }
9057}
9058
9d65a1bb 9059/* Write a sequence of commands able to fully rebuild the dataset into
9060 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9061static int rewriteAppendOnlyFile(char *filename) {
9062 dictIterator *di = NULL;
9063 dictEntry *de;
9064 FILE *fp;
9065 char tmpfile[256];
9066 int j;
9067 time_t now = time(NULL);
9068
9069 /* Note that we have to use a different temp name here compared to the
9070 * one used by rewriteAppendOnlyFileBackground() function. */
9071 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9072 fp = fopen(tmpfile,"w");
9073 if (!fp) {
9074 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9075 return REDIS_ERR;
9076 }
9077 for (j = 0; j < server.dbnum; j++) {
9078 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9079 redisDb *db = server.db+j;
9080 dict *d = db->dict;
9081 if (dictSize(d) == 0) continue;
9082 di = dictGetIterator(d);
9083 if (!di) {
9084 fclose(fp);
9085 return REDIS_ERR;
9086 }
9087
9088 /* SELECT the new DB */
9089 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9eaef89f 9090 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9d65a1bb 9091
9092 /* Iterate this DB writing every entry */
9093 while((de = dictNext(di)) != NULL) {
09241813 9094 sds keystr = dictGetEntryKey(de);
9095 robj key, *o;
e7546c63 9096 time_t expiretime;
9097 int swapped;
9098
09241813 9099 keystr = dictGetEntryKey(de);
560db612 9100 o = dictGetEntryVal(de);
09241813 9101 initStaticStringObject(key,keystr);
b9bc0eef 9102 /* If the value for this key is swapped, load a preview in memory.
9103 * We use a "swapped" flag to remember if we need to free the
9104 * value object instead to just increment the ref count anyway
9105 * in order to avoid copy-on-write of pages if we are forked() */
560db612 9106 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9107 o->storage == REDIS_VM_SWAPPING) {
e7546c63 9108 swapped = 0;
9109 } else {
560db612 9110 o = vmPreviewObject(o);
e7546c63 9111 swapped = 1;
9112 }
09241813 9113 expiretime = getExpire(db,&key);
9d65a1bb 9114
9115 /* Save the key and associated value */
9d65a1bb 9116 if (o->type == REDIS_STRING) {
9117 /* Emit a SET command */
9118 char cmd[]="*3\r\n$3\r\nSET\r\n";
9119 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9120 /* Key and value */
09241813 9121 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9122 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 9123 } else if (o->type == REDIS_LIST) {
9124 /* Emit the RPUSHes needed to rebuild the list */
6ddc908a
PN
9125 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9126 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9127 unsigned char *zl = o->ptr;
9128 unsigned char *p = ziplistIndex(zl,0);
9129 unsigned char *vstr;
9130 unsigned int vlen;
9131 long long vlong;
9132
9133 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9134 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9135 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9136 if (vstr) {
9137 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9138 goto werr;
9139 } else {
9140 if (fwriteBulkLongLong(fp,vlong) == 0)
9141 goto werr;
9142 }
9143 p = ziplistNext(zl,p);
9144 }
9145 } else if (o->encoding == REDIS_ENCODING_LIST) {
9146 list *list = o->ptr;
9147 listNode *ln;
9148 listIter li;
9149
9150 listRewind(list,&li);
9151 while((ln = listNext(&li))) {
9152 robj *eleobj = listNodeValue(ln);
9153
9154 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9155 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9156 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9157 }
9158 } else {
9159 redisPanic("Unknown list encoding");
9d65a1bb 9160 }
9161 } else if (o->type == REDIS_SET) {
9162 /* Emit the SADDs needed to rebuild the set */
9163 dict *set = o->ptr;
9164 dictIterator *di = dictGetIterator(set);
9165 dictEntry *de;
9166
9167 while((de = dictNext(di)) != NULL) {
9168 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9169 robj *eleobj = dictGetEntryKey(de);
9170
9171 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9172 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9173 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9174 }
9175 dictReleaseIterator(di);
9176 } else if (o->type == REDIS_ZSET) {
9177 /* Emit the ZADDs needed to rebuild the sorted set */
9178 zset *zs = o->ptr;
9179 dictIterator *di = dictGetIterator(zs->dict);
9180 dictEntry *de;
9181
9182 while((de = dictNext(di)) != NULL) {
9183 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9184 robj *eleobj = dictGetEntryKey(de);
9185 double *score = dictGetEntryVal(de);
9186
9187 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9188 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9d65a1bb 9189 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 9190 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9191 }
9192 dictReleaseIterator(di);
9c8e3cee 9193 } else if (o->type == REDIS_HASH) {
9194 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9195
9196 /* Emit the HSETs needed to rebuild the hash */
9197 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9198 unsigned char *p = zipmapRewind(o->ptr);
9199 unsigned char *field, *val;
9200 unsigned int flen, vlen;
9201
9202 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9203 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9204 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9205 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9206 return -1;
9207 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9208 return -1;
9209 }
9210 } else {
9211 dictIterator *di = dictGetIterator(o->ptr);
9212 dictEntry *de;
9213
9214 while((de = dictNext(di)) != NULL) {
9215 robj *field = dictGetEntryKey(de);
9216 robj *val = dictGetEntryVal(de);
9217
9218 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9219 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9220 if (fwriteBulkObject(fp,field) == -1) return -1;
9221 if (fwriteBulkObject(fp,val) == -1) return -1;
9222 }
9223 dictReleaseIterator(di);
9224 }
9d65a1bb 9225 } else {
f83c6cb5 9226 redisPanic("Unknown object type");
9d65a1bb 9227 }
9228 /* Save the expire time */
9229 if (expiretime != -1) {
e96e4fbf 9230 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 9231 /* If this key is already expired skip it */
9232 if (expiretime < now) continue;
9233 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9234 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9eaef89f 9235 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9d65a1bb 9236 }
b9bc0eef 9237 if (swapped) decrRefCount(o);
9d65a1bb 9238 }
9239 dictReleaseIterator(di);
9240 }
9241
9242 /* Make sure data will not remain on the OS's output buffers */
9243 fflush(fp);
b0bd87f6 9244 aof_fsync(fileno(fp));
9d65a1bb 9245 fclose(fp);
e0a62c7f 9246
9d65a1bb 9247 /* Use RENAME to make sure the DB file is changed atomically only
9248 * if the generate DB file is ok. */
9249 if (rename(tmpfile,filename) == -1) {
9250 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9251 unlink(tmpfile);
9252 return REDIS_ERR;
9253 }
9254 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9255 return REDIS_OK;
9256
9257werr:
9258 fclose(fp);
9259 unlink(tmpfile);
e96e4fbf 9260 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 9261 if (di) dictReleaseIterator(di);
9262 return REDIS_ERR;
9263}
9264
9265/* This is how rewriting of the append only file in background works:
9266 *
9267 * 1) The user calls BGREWRITEAOF
9268 * 2) Redis calls this function, that forks():
9269 * 2a) the child rewrite the append only file in a temp file.
9270 * 2b) the parent accumulates differences in server.bgrewritebuf.
9271 * 3) When the child finished '2a' exists.
9272 * 4) The parent will trap the exit code, if it's OK, will append the
9273 * data accumulated into server.bgrewritebuf into the temp file, and
9274 * finally will rename(2) the temp file in the actual file name.
9275 * The the new file is reopened as the new append only file. Profit!
9276 */
9277static int rewriteAppendOnlyFileBackground(void) {
9278 pid_t childpid;
9279
9280 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 9281 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 9282 if ((childpid = fork()) == 0) {
9283 /* Child */
9284 char tmpfile[256];
9d65a1bb 9285
054e426d 9286 if (server.vm_enabled) vmReopenSwapFile();
9287 close(server.fd);
9d65a1bb 9288 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9289 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 9290 _exit(0);
9d65a1bb 9291 } else {
478c2c6f 9292 _exit(1);
9d65a1bb 9293 }
9294 } else {
9295 /* Parent */
9296 if (childpid == -1) {
9297 redisLog(REDIS_WARNING,
9298 "Can't rewrite append only file in background: fork: %s",
9299 strerror(errno));
9300 return REDIS_ERR;
9301 }
9302 redisLog(REDIS_NOTICE,
9303 "Background append only file rewriting started by pid %d",childpid);
9304 server.bgrewritechildpid = childpid;
884d4b39 9305 updateDictResizePolicy();
85a83172 9306 /* We set appendseldb to -1 in order to force the next call to the
9307 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9308 * accumulated by the parent into server.bgrewritebuf will start
9309 * with a SELECT statement and it will be safe to merge. */
9310 server.appendseldb = -1;
9d65a1bb 9311 return REDIS_OK;
9312 }
9313 return REDIS_OK; /* unreached */
9314}
9315
9316static void bgrewriteaofCommand(redisClient *c) {
9317 if (server.bgrewritechildpid != -1) {
9318 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9319 return;
9320 }
9321 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 9322 char *status = "+Background append only file rewriting started\r\n";
9323 addReplySds(c,sdsnew(status));
9d65a1bb 9324 } else {
9325 addReply(c,shared.err);
9326 }
9327}
9328
/* Remove the temp file written by the background AOF rewrite child
 * having the specified pid. The outcome of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int id = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",id);
    unlink(path);
}
9335
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * non blocking by examining the command argument vector in order to
 * load in background the values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
 */
9356
560db612 9357/* =================== Virtual Memory - Blocking Side ====================== */
2e5eb04e 9358
560db612 9359/* Create a VM pointer object. This kind of objects are used in place of
9360 * values in the key -> value hash table, for swapped out objects. */
9361static vmpointer *createVmPointer(int vtype) {
9362 vmpointer *vp = zmalloc(sizeof(vmpointer));
2e5eb04e 9363
560db612 9364 vp->type = REDIS_VMPOINTER;
9365 vp->storage = REDIS_VM_SWAPPED;
9366 vp->vtype = vtype;
9367 return vp;
2e5eb04e 9368}
9369
75680a3c 9370static void vmInit(void) {
9371 off_t totsize;
996cb5f7 9372 int pipefds[2];
bcaa7a4f 9373 size_t stacksize;
8b5bb414 9374 struct flock fl;
75680a3c 9375
4ad37480 9376 if (server.vm_max_threads != 0)
9377 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9378
054e426d 9379 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 9380 /* Try to open the old swap file, otherwise create it */
6fa987e3 9381 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9382 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9383 }
75680a3c 9384 if (server.vm_fp == NULL) {
6fa987e3 9385 redisLog(REDIS_WARNING,
8b5bb414 9386 "Can't open the swap file: %s. Exiting.",
6fa987e3 9387 strerror(errno));
75680a3c 9388 exit(1);
9389 }
9390 server.vm_fd = fileno(server.vm_fp);
8b5bb414 9391 /* Lock the swap file for writing, this is useful in order to avoid
9392 * another instance to use the same swap file for a config error. */
9393 fl.l_type = F_WRLCK;
9394 fl.l_whence = SEEK_SET;
9395 fl.l_start = fl.l_len = 0;
9396 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9397 redisLog(REDIS_WARNING,
9398 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9399 exit(1);
9400 }
9401 /* Initialize */
75680a3c 9402 server.vm_next_page = 0;
9403 server.vm_near_pages = 0;
7d98e08c 9404 server.vm_stats_used_pages = 0;
9405 server.vm_stats_swapped_objects = 0;
9406 server.vm_stats_swapouts = 0;
9407 server.vm_stats_swapins = 0;
75680a3c 9408 totsize = server.vm_pages*server.vm_page_size;
9409 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9410 if (ftruncate(server.vm_fd,totsize) == -1) {
9411 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9412 strerror(errno));
9413 exit(1);
9414 } else {
9415 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9416 }
7d30035d 9417 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 9418 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 9419 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 9420 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 9421
996cb5f7 9422 /* Initialize threaded I/O (used by Virtual Memory) */
9423 server.io_newjobs = listCreate();
9424 server.io_processing = listCreate();
9425 server.io_processed = listCreate();
d5d55fc3 9426 server.io_ready_clients = listCreate();
92f8e882 9427 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 9428 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9429 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 9430 server.io_active_threads = 0;
996cb5f7 9431 if (pipe(pipefds) == -1) {
9432 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9433 ,strerror(errno));
9434 exit(1);
9435 }
9436 server.io_ready_pipe_read = pipefds[0];
9437 server.io_ready_pipe_write = pipefds[1];
9438 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 9439 /* LZF requires a lot of stack */
9440 pthread_attr_init(&server.io_threads_attr);
9441 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9442 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9443 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 9444 /* Listen for events in the threaded I/O pipe */
9445 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9446 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9447 oom("creating file event");
75680a3c 9448}
9449
06224fec 9450/* Mark the page as used */
9451static void vmMarkPageUsed(off_t page) {
9452 off_t byte = page/8;
9453 int bit = page&7;
970e10bb 9454 redisAssert(vmFreePage(page) == 1);
06224fec 9455 server.vm_bitmap[byte] |= 1<<bit;
9456}
9457
9458/* Mark N contiguous pages as used, with 'page' being the first. */
9459static void vmMarkPagesUsed(off_t page, off_t count) {
9460 off_t j;
9461
9462 for (j = 0; j < count; j++)
7d30035d 9463 vmMarkPageUsed(page+j);
7d98e08c 9464 server.vm_stats_used_pages += count;
7c775e09 9465 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9466 (long long)count, (long long)page);
06224fec 9467}
9468
9469/* Mark the page as free */
9470static void vmMarkPageFree(off_t page) {
9471 off_t byte = page/8;
9472 int bit = page&7;
970e10bb 9473 redisAssert(vmFreePage(page) == 0);
06224fec 9474 server.vm_bitmap[byte] &= ~(1<<bit);
9475}
9476
9477/* Mark N contiguous pages as free, with 'page' being the first. */
9478static void vmMarkPagesFree(off_t page, off_t count) {
9479 off_t j;
9480
9481 for (j = 0; j < count; j++)
7d30035d 9482 vmMarkPageFree(page+j);
7d98e08c 9483 server.vm_stats_used_pages -= count;
7c775e09 9484 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9485 (long long)count, (long long)page);
06224fec 9486}
9487
9488/* Test if the page is free */
9489static int vmFreePage(off_t page) {
9490 off_t byte = page/8;
9491 int bit = page&7;
7d30035d 9492 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9493}
9494
9495/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9496 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9497 * REDIS_ERR is returned.
06224fec 9498 *
9499 * This function uses a simple algorithm: we try to allocate
9500 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9501 * again from the start of the swap file searching for free spaces.
9502 *
9503 * If it looks pretty clear that there are no free pages near our offset
9504 * we try to find less populated places doing a forward jump of
9505 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9506 * without hurry, and then we jump again and so forth...
e0a62c7f 9507 *
06224fec 9508 * This function can be improved using a free list to avoid to guess
9509 * too much, since we could collect data about freed pages.
9510 *
9511 * note: I implemented this function just after watching an episode of
9512 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9513 */
c7df85a4 9514static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9515 off_t base, offset = 0, since_jump = 0, numfree = 0;
9516
9517 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9518 server.vm_near_pages = 0;
9519 server.vm_next_page = 0;
9520 }
9521 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9522 base = server.vm_next_page;
9523
9524 while(offset < server.vm_pages) {
9525 off_t this = base+offset;
9526
9527 /* If we overflow, restart from page zero */
9528 if (this >= server.vm_pages) {
9529 this -= server.vm_pages;
9530 if (this == 0) {
9531 /* Just overflowed, what we found on tail is no longer
9532 * interesting, as it's no longer contiguous. */
9533 numfree = 0;
9534 }
9535 }
9536 if (vmFreePage(this)) {
9537 /* This is a free page */
9538 numfree++;
9539 /* Already got N free pages? Return to the caller, with success */
9540 if (numfree == n) {
7d30035d 9541 *first = this-(n-1);
9542 server.vm_next_page = this+1;
7c775e09 9543 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9544 return REDIS_OK;
06224fec 9545 }
9546 } else {
9547 /* The current one is not a free page */
9548 numfree = 0;
9549 }
9550
9551 /* Fast-forward if the current page is not free and we already
9552 * searched enough near this place. */
9553 since_jump++;
9554 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9555 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9556 since_jump = 0;
9557 /* Note that even if we rewind after the jump, we are don't need
9558 * to make sure numfree is set to zero as we only jump *if* it
9559 * is set to zero. */
9560 } else {
9561 /* Otherwise just check the next page */
9562 offset++;
9563 }
9564 }
3a66edc7 9565 return REDIS_ERR;
9566}
9567
a5819310 9568/* Write the specified object at the specified page of the swap file */
9569static int vmWriteObjectOnSwap(robj *o, off_t page) {
9570 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9571 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9572 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9573 redisLog(REDIS_WARNING,
9ebed7cf 9574 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9575 strerror(errno));
9576 return REDIS_ERR;
9577 }
9578 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9579 fflush(server.vm_fp);
a5819310 9580 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9581 return REDIS_OK;
9582}
9583
a4798f73 9584/* Transfers the 'val' object to disk. Store all the information
9585 * a 'vmpointer' object containing all the information needed to load the
9586 * object back later is returned.
9587 *
3a66edc7 9588 * If we can't find enough contiguous empty pages to swap the object on disk
a4798f73 9589 * NULL is returned. */
560db612 9590static vmpointer *vmSwapObjectBlocking(robj *val) {
b9bc0eef 9591 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9592 off_t page;
560db612 9593 vmpointer *vp;
3a66edc7 9594
560db612 9595 assert(val->storage == REDIS_VM_MEMORY);
9596 assert(val->refcount == 1);
9597 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9598 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9599
9600 vp = createVmPointer(val->type);
9601 vp->page = page;
9602 vp->usedpages = pages;
3a66edc7 9603 decrRefCount(val); /* Deallocate the object from memory. */
9604 vmMarkPagesUsed(page,pages);
560db612 9605 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9606 (void*) val,
7d30035d 9607 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9608 server.vm_stats_swapped_objects++;
9609 server.vm_stats_swapouts++;
560db612 9610 return vp;
3a66edc7 9611}
9612
a5819310 9613static robj *vmReadObjectFromSwap(off_t page, int type) {
9614 robj *o;
3a66edc7 9615
a5819310 9616 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9617 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9618 redisLog(REDIS_WARNING,
d5d55fc3 9619 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9620 strerror(errno));
478c2c6f 9621 _exit(1);
3a66edc7 9622 }
a5819310 9623 o = rdbLoadObject(type,server.vm_fp);
9624 if (o == NULL) {
d5d55fc3 9625 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9626 _exit(1);
3a66edc7 9627 }
a5819310 9628 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9629 return o;
9630}
9631
560db612 9632/* Load the specified object from swap to memory.
a5819310 9633 * The newly allocated object is returned.
9634 *
9635 * If preview is true the unserialized object is returned to the caller but
560db612 9636 * the pages are not marked as freed, nor the vp object is freed. */
9637static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
a5819310 9638 robj *val;
9639
560db612 9640 redisAssert(vp->type == REDIS_VMPOINTER &&
9641 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9642 val = vmReadObjectFromSwap(vp->page,vp->vtype);
7e69548d 9643 if (!preview) {
560db612 9644 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9645 vmMarkPagesFree(vp->page,vp->usedpages);
9646 zfree(vp);
7d98e08c 9647 server.vm_stats_swapped_objects--;
38aba9a1 9648 } else {
560db612 9649 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
7e69548d 9650 }
7d98e08c 9651 server.vm_stats_swapins++;
3a66edc7 9652 return val;
06224fec 9653}
9654
560db612 9655/* Plain object loading, from swap to memory.
9656 *
9657 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9658 * The return value is the loaded object. */
9659static robj *vmLoadObject(robj *o) {
996cb5f7 9660 /* If we are loading the object in background, stop it, we
9661 * need to load this object synchronously ASAP. */
560db612 9662 if (o->storage == REDIS_VM_LOADING)
9663 vmCancelThreadedIOJob(o);
9664 return vmGenericLoadObject((vmpointer*)o,0);
7e69548d 9665}
9666
9667/* Just load the value on disk, without to modify the key.
9668 * This is useful when we want to perform some operation on the value
9669 * without to really bring it from swap to memory, like while saving the
9670 * dataset or rewriting the append only log. */
560db612 9671static robj *vmPreviewObject(robj *o) {
9672 return vmGenericLoadObject((vmpointer*)o,1);
7e69548d 9673}
9674
4ef8de8a 9675/* How a good candidate is this object for swapping?
9676 * The better candidate it is, the greater the returned value.
9677 *
9678 * Currently we try to perform a fast estimation of the object size in
9679 * memory, and combine it with aging informations.
9680 *
9681 * Basically swappability = idle-time * log(estimated size)
9682 *
9683 * Bigger objects are preferred over smaller objects, but not
9684 * proportionally, this is why we use the logarithm. This algorithm is
9685 * just a first try and will probably be tuned later. */
9686static double computeObjectSwappability(robj *o) {
560db612 9687 /* actual age can be >= minage, but not < minage. As we use wrapping
9688 * 21 bit clocks with minutes resolution for the LRU. */
9689 time_t minage = abs(server.lruclock - o->lru);
4e16d8b3
PN
9690 long asize = 0, elesize;
9691 robj *ele;
4ef8de8a 9692 list *l;
4e16d8b3 9693 listNode *ln;
4ef8de8a 9694 dict *d;
9695 struct dictEntry *de;
9696 int z;
9697
560db612 9698 if (minage <= 0) return 0;
4ef8de8a 9699 switch(o->type) {
9700 case REDIS_STRING:
9701 if (o->encoding != REDIS_ENCODING_RAW) {
9702 asize = sizeof(*o);
9703 } else {
9704 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9705 }
9706 break;
9707 case REDIS_LIST:
4e16d8b3
PN
9708 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9709 asize = sizeof(*o)+ziplistSize(o->ptr);
9710 } else {
9711 l = o->ptr;
9712 ln = listFirst(l);
9713 asize = sizeof(list);
9714 if (ln) {
9715 ele = ln->value;
9716 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9717 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
9718 asize += (sizeof(listNode)+elesize)*listLength(l);
9719 }
4ef8de8a 9720 }
9721 break;
9722 case REDIS_SET:
9723 case REDIS_ZSET:
9724 z = (o->type == REDIS_ZSET);
9725 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9726
9727 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9728 if (z) asize += sizeof(zset)-sizeof(dict);
9729 if (dictSize(d)) {
4ef8de8a 9730 de = dictGetRandomKey(d);
9731 ele = dictGetEntryKey(de);
9732 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9733 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9734 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9735 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9736 }
9737 break;
a97b9060 9738 case REDIS_HASH:
9739 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9740 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9741 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9742 unsigned int klen, vlen;
9743 unsigned char *key, *val;
9744
9745 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9746 klen = 0;
9747 vlen = 0;
9748 }
9749 asize = len*(klen+vlen+3);
9750 } else if (o->encoding == REDIS_ENCODING_HT) {
9751 d = o->ptr;
9752 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9753 if (dictSize(d)) {
a97b9060 9754 de = dictGetRandomKey(d);
9755 ele = dictGetEntryKey(de);
9756 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9757 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9758 ele = dictGetEntryVal(de);
9759 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9760 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9761 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9762 }
9763 }
9764 break;
4ef8de8a 9765 }
560db612 9766 return (double)minage*log(1+asize);
4ef8de8a 9767}
9768
9769/* Try to swap an object that's a good candidate for swapping.
9770 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9771 * to swap any object at all.
9772 *
9773 * If 'usethreaded' is true, Redis will try to swap the object in background
9774 * using I/O threads. */
9775static int vmSwapOneObject(int usethreads) {
4ef8de8a 9776 int j, i;
9777 struct dictEntry *best = NULL;
9778 double best_swappability = 0;
b9bc0eef 9779 redisDb *best_db = NULL;
44262c58 9780 robj *val;
9781 sds key;
4ef8de8a 9782
9783 for (j = 0; j < server.dbnum; j++) {
9784 redisDb *db = server.db+j;
b72f6a4b 9785 /* Why maxtries is set to 100?
9786 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9787 * are swappable objects */
b0d8747d 9788 int maxtries = 100;
4ef8de8a 9789
9790 if (dictSize(db->dict) == 0) continue;
9791 for (i = 0; i < 5; i++) {
9792 dictEntry *de;
9793 double swappability;
9794
e3cadb8a 9795 if (maxtries) maxtries--;
4ef8de8a 9796 de = dictGetRandomKey(db->dict);
4ef8de8a 9797 val = dictGetEntryVal(de);
1064ef87 9798 /* Only swap objects that are currently in memory.
9799 *
560db612 9800 * Also don't swap shared objects: not a good idea in general and
9801 * we need to ensure that the main thread does not touch the
1064ef87 9802 * object while the I/O thread is using it, but we can't
9803 * control other keys without adding additional mutex. */
560db612 9804 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
e3cadb8a 9805 if (maxtries) i--; /* don't count this try */
9806 continue;
9807 }
4ef8de8a 9808 swappability = computeObjectSwappability(val);
9809 if (!best || swappability > best_swappability) {
9810 best = de;
9811 best_swappability = swappability;
b9bc0eef 9812 best_db = db;
4ef8de8a 9813 }
9814 }
9815 }
7c775e09 9816 if (best == NULL) return REDIS_ERR;
4ef8de8a 9817 key = dictGetEntryKey(best);
9818 val = dictGetEntryVal(best);
9819
e3cadb8a 9820 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
44262c58 9821 key, best_swappability);
4ef8de8a 9822
4ef8de8a 9823 /* Swap it */
a69a0c9c 9824 if (usethreads) {
4c8f2370 9825 robj *keyobj = createStringObject(key,sdslen(key));
9826 vmSwapObjectThreaded(keyobj,val,best_db);
9827 decrRefCount(keyobj);
4ef8de8a 9828 return REDIS_OK;
9829 } else {
560db612 9830 vmpointer *vp;
9831
9832 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9833 dictGetEntryVal(best) = vp;
a69a0c9c 9834 return REDIS_OK;
9835 } else {
9836 return REDIS_ERR;
9837 }
4ef8de8a 9838 }
9839}
9840
/* Swap out one object synchronously. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9844
/* Swap out one object using the I/O threads. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9848
7e69548d 9849/* Return true if it's safe to swap out objects in a given moment.
9850 * Basically we don't want to swap objects out while there is a BGSAVE
9851 * or a BGAEOREWRITE running in backgroud. */
9852static int vmCanSwapOut(void) {
9853 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9854}
9855
996cb5f7 9856/* =================== Virtual Memory - Threaded I/O ======================= */
9857
b9bc0eef 9858static void freeIOJob(iojob *j) {
d5d55fc3 9859 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9860 j->type == REDIS_IOJOB_DO_SWAP ||
9861 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
560db612 9862 {
e4ed181d 9863 /* we fix the storage type, otherwise decrRefCount() will try to
9864 * kill the I/O thread Job (that does no longer exists). */
9865 if (j->val->storage == REDIS_VM_SWAPPING)
560db612 9866 j->val->storage = REDIS_VM_MEMORY;
b9bc0eef 9867 decrRefCount(j->val);
560db612 9868 }
9869 decrRefCount(j->key);
b9bc0eef 9870 zfree(j);
9871}
9872
996cb5f7 9873/* Every time a thread finished a Job, it writes a byte into the write side
9874 * of an unix pipe in order to "awake" the main thread, and this function
9875 * is called. */
9876static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9877 int mask)
9878{
9879 char buf[1];
b0d8747d 9880 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9881 REDIS_NOTUSED(el);
9882 REDIS_NOTUSED(mask);
9883 REDIS_NOTUSED(privdata);
9884
9885 /* For every byte we read in the read side of the pipe, there is one
9886 * I/O job completed to process. */
9887 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9888 iojob *j;
9889 listNode *ln;
b9bc0eef 9890 struct dictEntry *de;
9891
996cb5f7 9892 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9893
9894 /* Get the processed element (the oldest one) */
9895 lockThreadedIO();
1064ef87 9896 assert(listLength(server.io_processed) != 0);
f6c0bba8 9897 if (toprocess == -1) {
9898 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9899 if (toprocess <= 0) toprocess = 1;
9900 }
b9bc0eef 9901 ln = listFirst(server.io_processed);
9902 j = ln->value;
9903 listDelNode(server.io_processed,ln);
9904 unlockThreadedIO();
9905 /* If this job is marked as canceled, just ignore it */
9906 if (j->canceled) {
9907 freeIOJob(j);
9908 continue;
9909 }
9910 /* Post process it in the main thread, as there are things we
9911 * can do just here to avoid race conditions and/or invasive locks */
560db612 9912 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
44262c58 9913 de = dictFind(j->db->dict,j->key->ptr);
e4ed181d 9914 redisAssert(de != NULL);
b9bc0eef 9915 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9916 redisDb *db;
560db612 9917 vmpointer *vp = dictGetEntryVal(de);
d5d55fc3 9918
b9bc0eef 9919 /* Key loaded, bring it at home */
560db612 9920 vmMarkPagesFree(vp->page,vp->usedpages);
b9bc0eef 9921 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
560db612 9922 (unsigned char*) j->key->ptr);
b9bc0eef 9923 server.vm_stats_swapped_objects--;
9924 server.vm_stats_swapins++;
d5d55fc3 9925 dictGetEntryVal(de) = j->val;
9926 incrRefCount(j->val);
9927 db = j->db;
d5d55fc3 9928 /* Handle clients waiting for this key to be loaded. */
560db612 9929 handleClientsBlockedOnSwappedKey(db,j->key);
9930 freeIOJob(j);
9931 zfree(vp);
b9bc0eef 9932 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9933 /* Now we know the amount of pages required to swap this object.
9934 * Let's find some space for it, and queue this task again
9935 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9936 if (!vmCanSwapOut() ||
9937 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9938 {
9939 /* Ooops... no space or we can't swap as there is
9940 * a fork()ed Redis trying to save stuff on disk. */
560db612 9941 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9942 freeIOJob(j);
9943 } else {
c7df85a4 9944 /* Note that we need to mark this pages as used now,
9945 * if the job will be canceled, we'll mark them as freed
9946 * again. */
9947 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9948 j->type = REDIS_IOJOB_DO_SWAP;
9949 lockThreadedIO();
9950 queueIOJob(j);
9951 unlockThreadedIO();
9952 }
9953 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
560db612 9954 vmpointer *vp;
b9bc0eef 9955
9956 /* Key swapped. We can finally free some memory. */
560db612 9957 if (j->val->storage != REDIS_VM_SWAPPING) {
9958 vmpointer *vp = (vmpointer*) j->id;
9959 printf("storage: %d\n",vp->storage);
9960 printf("key->name: %s\n",(char*)j->key->ptr);
6c96ba7d 9961 printf("val: %p\n",(void*)j->val);
9962 printf("val->type: %d\n",j->val->type);
9963 printf("val->ptr: %s\n",(char*)j->val->ptr);
9964 }
560db612 9965 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9966 vp = createVmPointer(j->val->type);
9967 vp->page = j->page;
9968 vp->usedpages = j->pages;
9969 dictGetEntryVal(de) = vp;
e4ed181d 9970 /* Fix the storage otherwise decrRefCount will attempt to
9971 * remove the associated I/O job */
9972 j->val->storage = REDIS_VM_MEMORY;
560db612 9973 decrRefCount(j->val);
b9bc0eef 9974 redisLog(REDIS_DEBUG,
9975 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
560db612 9976 (unsigned char*) j->key->ptr,
b9bc0eef 9977 (unsigned long long) j->page, (unsigned long long) j->pages);
9978 server.vm_stats_swapped_objects++;
9979 server.vm_stats_swapouts++;
9980 freeIOJob(j);
f11b8647 9981 /* Put a few more swap requests in queue if we are still
9982 * out of memory */
b0d8747d 9983 if (trytoswap && vmCanSwapOut() &&
9984 zmalloc_used_memory() > server.vm_max_memory)
9985 {
f11b8647 9986 int more = 1;
9987 while(more) {
9988 lockThreadedIO();
9989 more = listLength(server.io_newjobs) <
9990 (unsigned) server.vm_max_threads;
9991 unlockThreadedIO();
9992 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9993 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9994 trytoswap = 0;
9995 break;
9996 }
f11b8647 9997 }
9998 }
b9bc0eef 9999 }
c953f24b 10000 processed++;
f6c0bba8 10001 if (processed == toprocess) return;
996cb5f7 10002 }
10003 if (retval < 0 && errno != EAGAIN) {
10004 redisLog(REDIS_WARNING,
10005 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10006 strerror(errno));
10007 }
10008}
10009
10010static void lockThreadedIO(void) {
10011 pthread_mutex_lock(&server.io_mutex);
10012}
10013
10014static void unlockThreadedIO(void) {
10015 pthread_mutex_unlock(&server.io_mutex);
10016}
10017
10018/* Remove the specified object from the threaded I/O queue if still not
10019 * processed, otherwise make sure to flag it as canceled. */
10020static void vmCancelThreadedIOJob(robj *o) {
10021 list *lists[3] = {
6c96ba7d 10022 server.io_newjobs, /* 0 */
10023 server.io_processing, /* 1 */
10024 server.io_processed /* 2 */
996cb5f7 10025 };
10026 int i;
10027
10028 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 10029again:
996cb5f7 10030 lockThreadedIO();
560db612 10031 /* Search for a matching object in one of the queues */
996cb5f7 10032 for (i = 0; i < 3; i++) {
10033 listNode *ln;
c7df85a4 10034 listIter li;
996cb5f7 10035
c7df85a4 10036 listRewind(lists[i],&li);
10037 while ((ln = listNext(&li)) != NULL) {
996cb5f7 10038 iojob *job = ln->value;
10039
6c96ba7d 10040 if (job->canceled) continue; /* Skip this, already canceled. */
560db612 10041 if (job->id == o) {
dbc289ae 10042 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10043 (void*)job, (char*)job->key->ptr, job->type, i);
427a2153 10044 /* Mark the pages as free since the swap didn't happened
10045 * or happened but is now discarded. */
970e10bb 10046 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 10047 vmMarkPagesFree(job->page,job->pages);
10048 /* Cancel the job. It depends on the list the job is
10049 * living in. */
996cb5f7 10050 switch(i) {
10051 case 0: /* io_newjobs */
6c96ba7d 10052 /* If the job was yet not processed the best thing to do
996cb5f7 10053 * is to remove it from the queue at all */
6c96ba7d 10054 freeIOJob(job);
996cb5f7 10055 listDelNode(lists[i],ln);
10056 break;
10057 case 1: /* io_processing */
d5d55fc3 10058 /* Oh Shi- the thread is messing with the Job:
10059 *
10060 * Probably it's accessing the object if this is a
10061 * PREPARE_SWAP or DO_SWAP job.
10062 * If it's a LOAD job it may be reading from disk and
10063 * if we don't wait for the job to terminate before to
10064 * cancel it, maybe in a few microseconds data can be
10065 * corrupted in this pages. So the short story is:
10066 *
10067 * Better to wait for the job to move into the
10068 * next queue (processed)... */
10069
10070 /* We try again and again until the job is completed. */
10071 unlockThreadedIO();
10072 /* But let's wait some time for the I/O thread
10073 * to finish with this job. After all this condition
10074 * should be very rare. */
10075 usleep(1);
10076 goto again;
996cb5f7 10077 case 2: /* io_processed */
2e111efe 10078 /* The job was already processed, that's easy...
10079 * just mark it as canceled so that we'll ignore it
10080 * when processing completed jobs. */
996cb5f7 10081 job->canceled = 1;
10082 break;
10083 }
c7df85a4 10084 /* Finally we have to adjust the storage type of the object
10085 * in order to "UNDO" the operaiton. */
996cb5f7 10086 if (o->storage == REDIS_VM_LOADING)
10087 o->storage = REDIS_VM_SWAPPED;
10088 else if (o->storage == REDIS_VM_SWAPPING)
10089 o->storage = REDIS_VM_MEMORY;
10090 unlockThreadedIO();
e4ed181d 10091 redisLog(REDIS_DEBUG,"*** DONE");
996cb5f7 10092 return;
10093 }
10094 }
10095 }
10096 unlockThreadedIO();
560db612 10097 printf("Not found: %p\n", (void*)o);
10098 redisAssert(1 != 1); /* We should never reach this */
996cb5f7 10099}
10100
b9bc0eef 10101static void *IOThreadEntryPoint(void *arg) {
10102 iojob *j;
10103 listNode *ln;
10104 REDIS_NOTUSED(arg);
10105
10106 pthread_detach(pthread_self());
10107 while(1) {
10108 /* Get a new job to process */
10109 lockThreadedIO();
10110 if (listLength(server.io_newjobs) == 0) {
10111 /* No new jobs in queue, exit. */
9ebed7cf 10112 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10113 (long) pthread_self());
b9bc0eef 10114 server.io_active_threads--;
10115 unlockThreadedIO();
10116 return NULL;
10117 }
10118 ln = listFirst(server.io_newjobs);
10119 j = ln->value;
10120 listDelNode(server.io_newjobs,ln);
10121 /* Add the job in the processing queue */
10122 j->thread = pthread_self();
10123 listAddNodeTail(server.io_processing,j);
10124 ln = listLast(server.io_processing); /* We use ln later to remove it */
10125 unlockThreadedIO();
9ebed7cf 10126 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10127 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 10128
10129 /* Process the Job */
10130 if (j->type == REDIS_IOJOB_LOAD) {
560db612 10131 vmpointer *vp = (vmpointer*)j->id;
10132 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
b9bc0eef 10133 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10134 FILE *fp = fopen("/dev/null","w+");
10135 j->pages = rdbSavedObjectPages(j->val,fp);
10136 fclose(fp);
10137 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 10138 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10139 j->canceled = 1;
b9bc0eef 10140 }
10141
10142 /* Done: insert the job into the processed queue */
9ebed7cf 10143 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10144 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 10145 lockThreadedIO();
10146 listDelNode(server.io_processing,ln);
10147 listAddNodeTail(server.io_processed,j);
10148 unlockThreadedIO();
e0a62c7f 10149
b9bc0eef 10150 /* Signal the main thread there is new stuff to process */
10151 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10152 }
10153 return NULL; /* never reached */
10154}
10155
10156static void spawnIOThread(void) {
10157 pthread_t thread;
478c2c6f 10158 sigset_t mask, omask;
a97b9060 10159 int err;
b9bc0eef 10160
478c2c6f 10161 sigemptyset(&mask);
10162 sigaddset(&mask,SIGCHLD);
10163 sigaddset(&mask,SIGHUP);
10164 sigaddset(&mask,SIGPIPE);
10165 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 10166 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10167 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10168 strerror(err));
10169 usleep(1000000);
10170 }
478c2c6f 10171 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 10172 server.io_active_threads++;
10173}
10174
4ee9488d 10175/* We need to wait for the last thread to exit before we are able to
10176 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 10177static void waitEmptyIOJobsQueue(void) {
4ee9488d 10178 while(1) {
76b7233a 10179 int io_processed_len;
10180
4ee9488d 10181 lockThreadedIO();
054e426d 10182 if (listLength(server.io_newjobs) == 0 &&
10183 listLength(server.io_processing) == 0 &&
10184 server.io_active_threads == 0)
10185 {
4ee9488d 10186 unlockThreadedIO();
10187 return;
10188 }
76b7233a 10189 /* While waiting for empty jobs queue condition we post-process some
10190 * finshed job, as I/O threads may be hanging trying to write against
10191 * the io_ready_pipe_write FD but there are so much pending jobs that
10192 * it's blocking. */
10193 io_processed_len = listLength(server.io_processed);
4ee9488d 10194 unlockThreadedIO();
76b7233a 10195 if (io_processed_len) {
10196 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10197 usleep(1000); /* 1 millisecond */
10198 } else {
10199 usleep(10000); /* 10 milliseconds */
10200 }
4ee9488d 10201 }
10202}
10203
054e426d 10204static void vmReopenSwapFile(void) {
478c2c6f 10205 /* Note: we don't close the old one as we are in the child process
10206 * and don't want to mess at all with the original file object. */
054e426d 10207 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10208 if (server.vm_fp == NULL) {
10209 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10210 server.vm_swap_file);
478c2c6f 10211 _exit(1);
054e426d 10212 }
10213 server.vm_fd = fileno(server.vm_fp);
10214}
10215
b9bc0eef 10216/* This function must be called while with threaded IO locked */
10217static void queueIOJob(iojob *j) {
6c96ba7d 10218 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10219 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 10220 listAddNodeTail(server.io_newjobs,j);
10221 if (server.io_active_threads < server.vm_max_threads)
10222 spawnIOThread();
10223}
10224
10225static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10226 iojob *j;
e0a62c7f 10227
b9bc0eef 10228 j = zmalloc(sizeof(*j));
10229 j->type = REDIS_IOJOB_PREPARE_SWAP;
10230 j->db = db;
78ebe4c8 10231 j->key = key;
7dd8e7cf 10232 incrRefCount(key);
560db612 10233 j->id = j->val = val;
b9bc0eef 10234 incrRefCount(val);
10235 j->canceled = 0;
10236 j->thread = (pthread_t) -1;
560db612 10237 val->storage = REDIS_VM_SWAPPING;
b9bc0eef 10238
10239 lockThreadedIO();
10240 queueIOJob(j);
10241 unlockThreadedIO();
10242 return REDIS_OK;
10243}
10244
b0d8747d 10245/* ============ Virtual Memory - Blocking clients on missing keys =========== */
10246
d5d55fc3 10247/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10248 * If there is not already a job loading the key, it is craeted.
10249 * The key is added to the io_keys list in the client structure, and also
10250 * in the hash table mapping swapped keys to waiting clients, that is,
10251 * server.io_waited_keys. */
10252static int waitForSwappedKey(redisClient *c, robj *key) {
10253 struct dictEntry *de;
10254 robj *o;
10255 list *l;
10256
10257 /* If the key does not exist or is already in RAM we don't need to
10258 * block the client at all. */
09241813 10259 de = dictFind(c->db->dict,key->ptr);
d5d55fc3 10260 if (de == NULL) return 0;
560db612 10261 o = dictGetEntryVal(de);
d5d55fc3 10262 if (o->storage == REDIS_VM_MEMORY) {
10263 return 0;
10264 } else if (o->storage == REDIS_VM_SWAPPING) {
10265 /* We were swapping the key, undo it! */
10266 vmCancelThreadedIOJob(o);
10267 return 0;
10268 }
e0a62c7f 10269
d5d55fc3 10270 /* OK: the key is either swapped, or being loaded just now. */
10271
10272 /* Add the key to the list of keys this client is waiting for.
10273 * This maps clients to keys they are waiting for. */
10274 listAddNodeTail(c->io_keys,key);
10275 incrRefCount(key);
10276
10277 /* Add the client to the swapped keys => clients waiting map. */
10278 de = dictFind(c->db->io_keys,key);
10279 if (de == NULL) {
10280 int retval;
10281
10282 /* For every key we take a list of clients blocked for it */
10283 l = listCreate();
10284 retval = dictAdd(c->db->io_keys,key,l);
10285 incrRefCount(key);
10286 assert(retval == DICT_OK);
10287 } else {
10288 l = dictGetEntryVal(de);
10289 }
10290 listAddNodeTail(l,c);
10291
10292 /* Are we already loading the key from disk? If not create a job */
10293 if (o->storage == REDIS_VM_SWAPPED) {
10294 iojob *j;
560db612 10295 vmpointer *vp = (vmpointer*)o;
d5d55fc3 10296
10297 o->storage = REDIS_VM_LOADING;
10298 j = zmalloc(sizeof(*j));
10299 j->type = REDIS_IOJOB_LOAD;
10300 j->db = c->db;
560db612 10301 j->id = (robj*)vp;
10302 j->key = key;
10303 incrRefCount(key);
10304 j->page = vp->page;
d5d55fc3 10305 j->val = NULL;
10306 j->canceled = 0;
10307 j->thread = (pthread_t) -1;
10308 lockThreadedIO();
10309 queueIOJob(j);
10310 unlockThreadedIO();
10311 }
10312 return 1;
10313}
10314
6f078746
PN
10315/* Preload keys for any command with first, last and step values for
10316 * the command keys prototype, as defined in the command table. */
10317static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10318 int j, last;
10319 if (cmd->vm_firstkey == 0) return;
10320 last = cmd->vm_lastkey;
10321 if (last < 0) last = argc+last;
10322 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10323 redisAssert(j < argc);
10324 waitForSwappedKey(c,argv[j]);
10325 }
10326}
10327
5d373da9 10328/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
10329 * Note that the number of keys to preload is user-defined, so we need to
10330 * apply a sanity check against argc. */
ca1788b5 10331static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 10332 int i, num;
ca1788b5 10333 REDIS_NOTUSED(cmd);
ca1788b5
PN
10334
10335 num = atoi(argv[2]->ptr);
739ba0d2 10336 if (num > (argc-3)) return;
76583ea4 10337 for (i = 0; i < num; i++) {
ca1788b5 10338 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
10339 }
10340}
10341
3805e04f
PN
10342/* Preload keys needed to execute the entire MULTI/EXEC block.
10343 *
10344 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10345 * and will block the client when any command requires a swapped out value. */
10346static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10347 int i, margc;
10348 struct redisCommand *mcmd;
10349 robj **margv;
10350 REDIS_NOTUSED(cmd);
10351 REDIS_NOTUSED(argc);
10352 REDIS_NOTUSED(argv);
10353
10354 if (!(c->flags & REDIS_MULTI)) return;
10355 for (i = 0; i < c->mstate.count; i++) {
10356 mcmd = c->mstate.commands[i].cmd;
10357 margc = c->mstate.commands[i].argc;
10358 margv = c->mstate.commands[i].argv;
10359
10360 if (mcmd->vm_preload_proc != NULL) {
10361 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10362 } else {
10363 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10364 }
76583ea4
PN
10365 }
10366}
10367
b0d8747d 10368/* Is this client attempting to run a command against swapped keys?
d5d55fc3 10369 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 10370 *
d5d55fc3 10371 * The important idea about this function is that it can fail! If keys will
10372 * still be swapped when the client is resumed, this key lookups will
10373 * just block loading keys from disk. In practical terms this should only
10374 * happen with SORT BY command or if there is a bug in this function.
10375 *
10376 * Return 1 if the client is marked as blocked, 0 if the client can
10377 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 10378static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 10379 if (cmd->vm_preload_proc != NULL) {
ca1788b5 10380 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 10381 } else {
6f078746 10382 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
10383 }
10384
d5d55fc3 10385 /* If the client was blocked for at least one key, mark it as blocked. */
10386 if (listLength(c->io_keys)) {
10387 c->flags |= REDIS_IO_WAIT;
10388 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10389 server.vm_blocked_clients++;
10390 return 1;
10391 } else {
10392 return 0;
10393 }
10394}
10395
10396/* Remove the 'key' from the list of blocked keys for a given client.
10397 *
10398 * The function returns 1 when there are no longer blocking keys after
10399 * the current one was removed (and the client can be unblocked). */
10400static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10401 list *l;
10402 listNode *ln;
10403 listIter li;
10404 struct dictEntry *de;
10405
10406 /* Remove the key from the list of keys this client is waiting for. */
10407 listRewind(c->io_keys,&li);
10408 while ((ln = listNext(&li)) != NULL) {
bf028098 10409 if (equalStringObjects(ln->value,key)) {
d5d55fc3 10410 listDelNode(c->io_keys,ln);
10411 break;
10412 }
10413 }
10414 assert(ln != NULL);
10415
10416 /* Remove the client form the key => waiting clients map. */
10417 de = dictFind(c->db->io_keys,key);
10418 assert(de != NULL);
10419 l = dictGetEntryVal(de);
10420 ln = listSearchKey(l,c);
10421 assert(ln != NULL);
10422 listDelNode(l,ln);
10423 if (listLength(l) == 0)
10424 dictDelete(c->db->io_keys,key);
10425
10426 return listLength(c->io_keys) == 0;
10427}
10428
560db612 10429/* Every time we now a key was loaded back in memory, we handle clients
10430 * waiting for this key if any. */
d5d55fc3 10431static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10432 struct dictEntry *de;
10433 list *l;
10434 listNode *ln;
10435 int len;
10436
10437 de = dictFind(db->io_keys,key);
10438 if (!de) return;
10439
10440 l = dictGetEntryVal(de);
10441 len = listLength(l);
10442 /* Note: we can't use something like while(listLength(l)) as the list
10443 * can be freed by the calling function when we remove the last element. */
10444 while (len--) {
10445 ln = listFirst(l);
10446 redisClient *c = ln->value;
10447
10448 if (dontWaitForSwappedKey(c,key)) {
10449 /* Put the client in the list of clients ready to go as we
10450 * loaded all the keys about it. */
10451 listAddNodeTail(server.io_ready_clients,c);
10452 }
10453 }
b0d8747d 10454}
b0d8747d 10455
500ece7c 10456/* =========================== Remote Configuration ========================= */
10457
10458static void configSetCommand(redisClient *c) {
10459 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10460 long long ll;
10461
500ece7c 10462 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10463 zfree(server.dbfilename);
10464 server.dbfilename = zstrdup(o->ptr);
10465 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10466 zfree(server.requirepass);
10467 server.requirepass = zstrdup(o->ptr);
10468 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10469 zfree(server.masterauth);
10470 server.masterauth = zstrdup(o->ptr);
10471 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10472 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10473 ll < 0) goto badfmt;
10474 server.maxmemory = ll;
10475 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10476 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10477 ll < 0 || ll > LONG_MAX) goto badfmt;
10478 server.maxidletime = ll;
1b677732 10479 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10480 if (!strcasecmp(o->ptr,"no")) {
10481 server.appendfsync = APPENDFSYNC_NO;
10482 } else if (!strcasecmp(o->ptr,"everysec")) {
10483 server.appendfsync = APPENDFSYNC_EVERYSEC;
10484 } else if (!strcasecmp(o->ptr,"always")) {
10485 server.appendfsync = APPENDFSYNC_ALWAYS;
10486 } else {
10487 goto badfmt;
10488 }
38db9171 10489 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10490 int yn = yesnotoi(o->ptr);
10491
10492 if (yn == -1) goto badfmt;
10493 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10494 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10495 int old = server.appendonly;
10496 int new = yesnotoi(o->ptr);
10497
10498 if (new == -1) goto badfmt;
10499 if (old != new) {
10500 if (new == 0) {
10501 stopAppendOnly();
10502 } else {
10503 if (startAppendOnly() == REDIS_ERR) {
10504 addReplySds(c,sdscatprintf(sdsempty(),
10505 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10506 decrRefCount(o);
10507 return;
10508 }
10509 }
10510 }
a34e0a25 10511 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10512 int vlen, j;
10513 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10514
10515 /* Perform sanity check before setting the new config:
10516 * - Even number of args
10517 * - Seconds >= 1, changes >= 0 */
10518 if (vlen & 1) {
10519 sdsfreesplitres(v,vlen);
10520 goto badfmt;
10521 }
10522 for (j = 0; j < vlen; j++) {
10523 char *eptr;
10524 long val;
10525
10526 val = strtoll(v[j], &eptr, 10);
10527 if (eptr[0] != '\0' ||
10528 ((j & 1) == 0 && val < 1) ||
10529 ((j & 1) == 1 && val < 0)) {
10530 sdsfreesplitres(v,vlen);
10531 goto badfmt;
10532 }
10533 }
10534 /* Finally set the new config */
10535 resetServerSaveParams();
10536 for (j = 0; j < vlen; j += 2) {
10537 time_t seconds;
10538 int changes;
10539
10540 seconds = strtoll(v[j],NULL,10);
10541 changes = strtoll(v[j+1],NULL,10);
10542 appendServerSaveParams(seconds, changes);
10543 }
10544 sdsfreesplitres(v,vlen);
500ece7c 10545 } else {
10546 addReplySds(c,sdscatprintf(sdsempty(),
10547 "-ERR not supported CONFIG parameter %s\r\n",
10548 (char*)c->argv[2]->ptr));
10549 decrRefCount(o);
10550 return;
10551 }
10552 decrRefCount(o);
10553 addReply(c,shared.ok);
a34e0a25 10554 return;
10555
10556badfmt: /* Bad format errors */
10557 addReplySds(c,sdscatprintf(sdsempty(),
10558 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10559 (char*)o->ptr,
10560 (char*)c->argv[2]->ptr));
10561 decrRefCount(o);
500ece7c 10562}
10563
10564static void configGetCommand(redisClient *c) {
10565 robj *o = getDecodedObject(c->argv[2]);
10566 robj *lenobj = createObject(REDIS_STRING,NULL);
10567 char *pattern = o->ptr;
10568 int matches = 0;
10569
10570 addReply(c,lenobj);
10571 decrRefCount(lenobj);
10572
10573 if (stringmatch(pattern,"dbfilename",0)) {
10574 addReplyBulkCString(c,"dbfilename");
10575 addReplyBulkCString(c,server.dbfilename);
10576 matches++;
10577 }
10578 if (stringmatch(pattern,"requirepass",0)) {
10579 addReplyBulkCString(c,"requirepass");
10580 addReplyBulkCString(c,server.requirepass);
10581 matches++;
10582 }
10583 if (stringmatch(pattern,"masterauth",0)) {
10584 addReplyBulkCString(c,"masterauth");
10585 addReplyBulkCString(c,server.masterauth);
10586 matches++;
10587 }
10588 if (stringmatch(pattern,"maxmemory",0)) {
10589 char buf[128];
10590
2e5eb04e 10591 ll2string(buf,128,server.maxmemory);
500ece7c 10592 addReplyBulkCString(c,"maxmemory");
10593 addReplyBulkCString(c,buf);
10594 matches++;
10595 }
2e5eb04e 10596 if (stringmatch(pattern,"timeout",0)) {
10597 char buf[128];
10598
10599 ll2string(buf,128,server.maxidletime);
10600 addReplyBulkCString(c,"timeout");
10601 addReplyBulkCString(c,buf);
10602 matches++;
10603 }
10604 if (stringmatch(pattern,"appendonly",0)) {
10605 addReplyBulkCString(c,"appendonly");
10606 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10607 matches++;
10608 }
38db9171 10609 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10610 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10611 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10612 matches++;
10613 }
1b677732 10614 if (stringmatch(pattern,"appendfsync",0)) {
10615 char *policy;
10616
10617 switch(server.appendfsync) {
10618 case APPENDFSYNC_NO: policy = "no"; break;
10619 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10620 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10621 default: policy = "unknown"; break; /* too harmless to panic */
10622 }
10623 addReplyBulkCString(c,"appendfsync");
10624 addReplyBulkCString(c,policy);
10625 matches++;
10626 }
a34e0a25 10627 if (stringmatch(pattern,"save",0)) {
10628 sds buf = sdsempty();
10629 int j;
10630
10631 for (j = 0; j < server.saveparamslen; j++) {
10632 buf = sdscatprintf(buf,"%ld %d",
10633 server.saveparams[j].seconds,
10634 server.saveparams[j].changes);
10635 if (j != server.saveparamslen-1)
10636 buf = sdscatlen(buf," ",1);
10637 }
10638 addReplyBulkCString(c,"save");
10639 addReplyBulkCString(c,buf);
10640 sdsfree(buf);
10641 matches++;
10642 }
500ece7c 10643 decrRefCount(o);
10644 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10645}
10646
10647static void configCommand(redisClient *c) {
10648 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10649 if (c->argc != 4) goto badarity;
10650 configSetCommand(c);
10651 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10652 if (c->argc != 3) goto badarity;
10653 configGetCommand(c);
10654 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10655 if (c->argc != 2) goto badarity;
10656 server.stat_numcommands = 0;
10657 server.stat_numconnections = 0;
10658 server.stat_expiredkeys = 0;
10659 server.stat_starttime = time(NULL);
10660 addReply(c,shared.ok);
10661 } else {
10662 addReplySds(c,sdscatprintf(sdsempty(),
10663 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10664 }
10665 return;
10666
10667badarity:
10668 addReplySds(c,sdscatprintf(sdsempty(),
10669 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10670 (char*) c->argv[1]->ptr));
10671}
10672
befec3cd 10673/* =========================== Pubsub implementation ======================== */
10674
ffc6b7f8 10675static void freePubsubPattern(void *p) {
10676 pubsubPattern *pat = p;
10677
10678 decrRefCount(pat->pattern);
10679 zfree(pat);
10680}
10681
10682static int listMatchPubsubPattern(void *a, void *b) {
10683 pubsubPattern *pa = a, *pb = b;
10684
10685 return (pa->client == pb->client) &&
bf028098 10686 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10687}
10688
10689/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10690 * 0 if the client was already subscribed to that channel. */
10691static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10692 struct dictEntry *de;
10693 list *clients = NULL;
10694 int retval = 0;
10695
ffc6b7f8 10696 /* Add the channel to the client -> channels hash table */
10697 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10698 retval = 1;
ffc6b7f8 10699 incrRefCount(channel);
10700 /* Add the client to the channel -> list of clients hash table */
10701 de = dictFind(server.pubsub_channels,channel);
befec3cd 10702 if (de == NULL) {
10703 clients = listCreate();
ffc6b7f8 10704 dictAdd(server.pubsub_channels,channel,clients);
10705 incrRefCount(channel);
befec3cd 10706 } else {
10707 clients = dictGetEntryVal(de);
10708 }
10709 listAddNodeTail(clients,c);
10710 }
10711 /* Notify the client */
10712 addReply(c,shared.mbulk3);
10713 addReply(c,shared.subscribebulk);
ffc6b7f8 10714 addReplyBulk(c,channel);
482b672d 10715 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10716 return retval;
10717}
10718
ffc6b7f8 10719/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10720 * 0 if the client was not subscribed to the specified channel. */
10721static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10722 struct dictEntry *de;
10723 list *clients;
10724 listNode *ln;
10725 int retval = 0;
10726
ffc6b7f8 10727 /* Remove the channel from the client -> channels hash table */
10728 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10729 we have in the hash tables. Protect it... */
ffc6b7f8 10730 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10731 retval = 1;
ffc6b7f8 10732 /* Remove the client from the channel -> clients list hash table */
10733 de = dictFind(server.pubsub_channels,channel);
befec3cd 10734 assert(de != NULL);
10735 clients = dictGetEntryVal(de);
10736 ln = listSearchKey(clients,c);
10737 assert(ln != NULL);
10738 listDelNode(clients,ln);
ff767a75 10739 if (listLength(clients) == 0) {
10740 /* Free the list and associated hash entry at all if this was
10741 * the latest client, so that it will be possible to abuse
ffc6b7f8 10742 * Redis PUBSUB creating millions of channels. */
10743 dictDelete(server.pubsub_channels,channel);
ff767a75 10744 }
befec3cd 10745 }
10746 /* Notify the client */
10747 if (notify) {
10748 addReply(c,shared.mbulk3);
10749 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10750 addReplyBulk(c,channel);
482b672d 10751 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10752 listLength(c->pubsub_patterns));
10753
10754 }
10755 decrRefCount(channel); /* it is finally safe to release it */
10756 return retval;
10757}
10758
10759/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10760static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10761 int retval = 0;
10762
10763 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10764 retval = 1;
10765 pubsubPattern *pat;
10766 listAddNodeTail(c->pubsub_patterns,pattern);
10767 incrRefCount(pattern);
10768 pat = zmalloc(sizeof(*pat));
10769 pat->pattern = getDecodedObject(pattern);
10770 pat->client = c;
10771 listAddNodeTail(server.pubsub_patterns,pat);
10772 }
10773 /* Notify the client */
10774 addReply(c,shared.mbulk3);
10775 addReply(c,shared.psubscribebulk);
10776 addReplyBulk(c,pattern);
482b672d 10777 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10778 return retval;
10779}
10780
10781/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10782 * 0 if the client was not subscribed to the specified channel. */
10783static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10784 listNode *ln;
10785 pubsubPattern pat;
10786 int retval = 0;
10787
10788 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10789 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10790 retval = 1;
10791 listDelNode(c->pubsub_patterns,ln);
10792 pat.client = c;
10793 pat.pattern = pattern;
10794 ln = listSearchKey(server.pubsub_patterns,&pat);
10795 listDelNode(server.pubsub_patterns,ln);
10796 }
10797 /* Notify the client */
10798 if (notify) {
10799 addReply(c,shared.mbulk3);
10800 addReply(c,shared.punsubscribebulk);
10801 addReplyBulk(c,pattern);
482b672d 10802 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10803 listLength(c->pubsub_patterns));
befec3cd 10804 }
ffc6b7f8 10805 decrRefCount(pattern);
befec3cd 10806 return retval;
10807}
10808
ffc6b7f8 10809/* Unsubscribe from all the channels. Return the number of channels the
10810 * client was subscribed from. */
10811static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10812 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10813 dictEntry *de;
10814 int count = 0;
10815
10816 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10817 robj *channel = dictGetEntryKey(de);
befec3cd 10818
ffc6b7f8 10819 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10820 }
10821 dictReleaseIterator(di);
10822 return count;
10823}
10824
ffc6b7f8 10825/* Unsubscribe from all the patterns. Return the number of patterns the
10826 * client was subscribed from. */
10827static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10828 listNode *ln;
10829 listIter li;
10830 int count = 0;
10831
10832 listRewind(c->pubsub_patterns,&li);
10833 while ((ln = listNext(&li)) != NULL) {
10834 robj *pattern = ln->value;
10835
10836 count += pubsubUnsubscribePattern(c,pattern,notify);
10837 }
10838 return count;
10839}
10840
befec3cd 10841/* Publish a message */
ffc6b7f8 10842static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10843 int receivers = 0;
10844 struct dictEntry *de;
ffc6b7f8 10845 listNode *ln;
10846 listIter li;
befec3cd 10847
ffc6b7f8 10848 /* Send to clients listening for that channel */
10849 de = dictFind(server.pubsub_channels,channel);
befec3cd 10850 if (de) {
10851 list *list = dictGetEntryVal(de);
10852 listNode *ln;
10853 listIter li;
10854
10855 listRewind(list,&li);
10856 while ((ln = listNext(&li)) != NULL) {
10857 redisClient *c = ln->value;
10858
10859 addReply(c,shared.mbulk3);
10860 addReply(c,shared.messagebulk);
ffc6b7f8 10861 addReplyBulk(c,channel);
befec3cd 10862 addReplyBulk(c,message);
10863 receivers++;
10864 }
10865 }
ffc6b7f8 10866 /* Send to clients listening to matching channels */
10867 if (listLength(server.pubsub_patterns)) {
10868 listRewind(server.pubsub_patterns,&li);
10869 channel = getDecodedObject(channel);
10870 while ((ln = listNext(&li)) != NULL) {
10871 pubsubPattern *pat = ln->value;
10872
10873 if (stringmatchlen((char*)pat->pattern->ptr,
10874 sdslen(pat->pattern->ptr),
10875 (char*)channel->ptr,
10876 sdslen(channel->ptr),0)) {
c8d0ea0e 10877 addReply(pat->client,shared.mbulk4);
10878 addReply(pat->client,shared.pmessagebulk);
10879 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10880 addReplyBulk(pat->client,channel);
10881 addReplyBulk(pat->client,message);
10882 receivers++;
10883 }
10884 }
10885 decrRefCount(channel);
10886 }
befec3cd 10887 return receivers;
10888}
10889
10890static void subscribeCommand(redisClient *c) {
10891 int j;
10892
10893 for (j = 1; j < c->argc; j++)
ffc6b7f8 10894 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10895}
10896
10897static void unsubscribeCommand(redisClient *c) {
10898 if (c->argc == 1) {
ffc6b7f8 10899 pubsubUnsubscribeAllChannels(c,1);
10900 return;
10901 } else {
10902 int j;
10903
10904 for (j = 1; j < c->argc; j++)
10905 pubsubUnsubscribeChannel(c,c->argv[j],1);
10906 }
10907}
10908
10909static void psubscribeCommand(redisClient *c) {
10910 int j;
10911
10912 for (j = 1; j < c->argc; j++)
10913 pubsubSubscribePattern(c,c->argv[j]);
10914}
10915
10916static void punsubscribeCommand(redisClient *c) {
10917 if (c->argc == 1) {
10918 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10919 return;
10920 } else {
10921 int j;
10922
10923 for (j = 1; j < c->argc; j++)
ffc6b7f8 10924 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10925 }
10926}
10927
10928static void publishCommand(redisClient *c) {
10929 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10930 addReplyLongLong(c,receivers);
befec3cd 10931}
10932
37ab76c9 10933/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10934 *
10935 * The implementation uses a per-DB hash table mapping keys to list of clients
10936 * WATCHing those keys, so that given a key that is going to be modified
10937 * we can mark all the associated clients as dirty.
10938 *
10939 * Also every client contains a list of WATCHed keys so that's possible to
10940 * un-watch such keys when the client is freed or when UNWATCH is called. */
10941
10942/* In the client->watched_keys list we need to use watchedKey structures
10943 * as in order to identify a key in Redis we need both the key name and the
10944 * DB */
10945typedef struct watchedKey {
10946 robj *key;
10947 redisDb *db;
10948} watchedKey;
10949
10950/* Watch for the specified key */
10951static void watchForKey(redisClient *c, robj *key) {
10952 list *clients = NULL;
10953 listIter li;
10954 listNode *ln;
10955 watchedKey *wk;
10956
10957 /* Check if we are already watching for this key */
10958 listRewind(c->watched_keys,&li);
10959 while((ln = listNext(&li))) {
10960 wk = listNodeValue(ln);
10961 if (wk->db == c->db && equalStringObjects(key,wk->key))
10962 return; /* Key already watched */
10963 }
10964 /* This key is not already watched in this DB. Let's add it */
10965 clients = dictFetchValue(c->db->watched_keys,key);
10966 if (!clients) {
10967 clients = listCreate();
10968 dictAdd(c->db->watched_keys,key,clients);
10969 incrRefCount(key);
10970 }
10971 listAddNodeTail(clients,c);
10972 /* Add the new key to the lits of keys watched by this client */
10973 wk = zmalloc(sizeof(*wk));
10974 wk->key = key;
10975 wk->db = c->db;
10976 incrRefCount(key);
10977 listAddNodeTail(c->watched_keys,wk);
10978}
10979
10980/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10981 * flag is up to the caller. */
10982static void unwatchAllKeys(redisClient *c) {
10983 listIter li;
10984 listNode *ln;
10985
10986 if (listLength(c->watched_keys) == 0) return;
10987 listRewind(c->watched_keys,&li);
10988 while((ln = listNext(&li))) {
10989 list *clients;
10990 watchedKey *wk;
10991
10992 /* Lookup the watched key -> clients list and remove the client
10993 * from the list */
10994 wk = listNodeValue(ln);
10995 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10996 assert(clients != NULL);
10997 listDelNode(clients,listSearchKey(clients,c));
10998 /* Kill the entry at all if this was the only client */
10999 if (listLength(clients) == 0)
11000 dictDelete(wk->db->watched_keys, wk->key);
11001 /* Remove this watched key from the client->watched list */
11002 listDelNode(c->watched_keys,ln);
11003 decrRefCount(wk->key);
11004 zfree(wk);
11005 }
11006}
11007
ca3f830b 11008/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 11009 * next EXEC will fail. */
11010static void touchWatchedKey(redisDb *db, robj *key) {
11011 list *clients;
11012 listIter li;
11013 listNode *ln;
11014
11015 if (dictSize(db->watched_keys) == 0) return;
11016 clients = dictFetchValue(db->watched_keys, key);
11017 if (!clients) return;
11018
11019 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11020 /* Check if we are already watching for this key */
11021 listRewind(clients,&li);
11022 while((ln = listNext(&li))) {
11023 redisClient *c = listNodeValue(ln);
11024
11025 c->flags |= REDIS_DIRTY_CAS;
11026 }
11027}
11028
9b30e1a2 11029/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11030 * flush but will be deleted as effect of the flushing operation should
11031 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11032 * a FLUSHALL operation (all the DBs flushed). */
11033static void touchWatchedKeysOnFlush(int dbid) {
11034 listIter li1, li2;
11035 listNode *ln;
11036
11037 /* For every client, check all the waited keys */
11038 listRewind(server.clients,&li1);
11039 while((ln = listNext(&li1))) {
11040 redisClient *c = listNodeValue(ln);
11041 listRewind(c->watched_keys,&li2);
11042 while((ln = listNext(&li2))) {
11043 watchedKey *wk = listNodeValue(ln);
11044
11045 /* For every watched key matching the specified DB, if the
11046 * key exists, mark the client as dirty, as the key will be
11047 * removed. */
11048 if (dbid == -1 || wk->db->id == dbid) {
09241813 11049 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
9b30e1a2 11050 c->flags |= REDIS_DIRTY_CAS;
11051 }
11052 }
11053 }
11054}
11055
37ab76c9 11056static void watchCommand(redisClient *c) {
11057 int j;
11058
6531c94d 11059 if (c->flags & REDIS_MULTI) {
11060 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11061 return;
11062 }
37ab76c9 11063 for (j = 1; j < c->argc; j++)
11064 watchForKey(c,c->argv[j]);
11065 addReply(c,shared.ok);
11066}
11067
11068static void unwatchCommand(redisClient *c) {
11069 unwatchAllKeys(c);
11070 c->flags &= (~REDIS_DIRTY_CAS);
11071 addReply(c,shared.ok);
11072}
11073
7f957c92 11074/* ================================= Debugging ============================== */
11075
ba798261 11076/* Compute the sha1 of string at 's' with 'len' bytes long.
11077 * The SHA1 is then xored againt the string pointed by digest.
11078 * Since xor is commutative, this operation is used in order to
11079 * "add" digests relative to unordered elements.
11080 *
11081 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11082static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11083 SHA1_CTX ctx;
11084 unsigned char hash[20], *s = ptr;
11085 int j;
11086
11087 SHA1Init(&ctx);
11088 SHA1Update(&ctx,s,len);
11089 SHA1Final(hash,&ctx);
11090
11091 for (j = 0; j < 20; j++)
11092 digest[j] ^= hash[j];
11093}
11094
11095static void xorObjectDigest(unsigned char *digest, robj *o) {
11096 o = getDecodedObject(o);
11097 xorDigest(digest,o->ptr,sdslen(o->ptr));
11098 decrRefCount(o);
11099}
11100
11101/* This function instead of just computing the SHA1 and xoring it
11102 * against diget, also perform the digest of "digest" itself and
11103 * replace the old value with the new one.
11104 *
11105 * So the final digest will be:
11106 *
11107 * digest = SHA1(digest xor SHA1(data))
11108 *
11109 * This function is used every time we want to preserve the order so
11110 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11111 *
11112 * Also note that mixdigest("foo") followed by mixdigest("bar")
11113 * will lead to a different digest compared to "fo", "obar".
11114 */
11115static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11116 SHA1_CTX ctx;
11117 char *s = ptr;
11118
11119 xorDigest(digest,s,len);
11120 SHA1Init(&ctx);
11121 SHA1Update(&ctx,digest,20);
11122 SHA1Final(digest,&ctx);
11123}
11124
11125static void mixObjectDigest(unsigned char *digest, robj *o) {
11126 o = getDecodedObject(o);
11127 mixDigest(digest,o->ptr,sdslen(o->ptr));
11128 decrRefCount(o);
11129}
11130
11131/* Compute the dataset digest. Since keys, sets elements, hashes elements
11132 * are not ordered, we use a trick: every aggregate digest is the xor
11133 * of the digests of their elements. This way the order will not change
11134 * the result. For list instead we use a feedback entering the output digest
11135 * as input in order to ensure that a different ordered list will result in
11136 * a different digest. */
11137static void computeDatasetDigest(unsigned char *final) {
11138 unsigned char digest[20];
11139 char buf[128];
11140 dictIterator *di = NULL;
11141 dictEntry *de;
11142 int j;
11143 uint32_t aux;
11144
11145 memset(final,0,20); /* Start with a clean result */
11146
11147 for (j = 0; j < server.dbnum; j++) {
11148 redisDb *db = server.db+j;
11149
11150 if (dictSize(db->dict) == 0) continue;
11151 di = dictGetIterator(db->dict);
11152
11153 /* hash the DB id, so the same dataset moved in a different
11154 * DB will lead to a different digest */
11155 aux = htonl(j);
11156 mixDigest(final,&aux,sizeof(aux));
11157
11158 /* Iterate this DB writing every entry */
11159 while((de = dictNext(di)) != NULL) {
09241813 11160 sds key;
11161 robj *keyobj, *o;
ba798261 11162 time_t expiretime;
11163
11164 memset(digest,0,20); /* This key-val digest */
11165 key = dictGetEntryKey(de);
09241813 11166 keyobj = createStringObject(key,sdslen(key));
11167
11168 mixDigest(digest,key,sdslen(key));
11169
11170 /* Make sure the key is loaded if VM is active */
11171 o = lookupKeyRead(db,keyobj);
cbae1d34 11172
ba798261 11173 aux = htonl(o->type);
11174 mixDigest(digest,&aux,sizeof(aux));
09241813 11175 expiretime = getExpire(db,keyobj);
ba798261 11176
11177 /* Save the key and associated value */
11178 if (o->type == REDIS_STRING) {
11179 mixObjectDigest(digest,o);
11180 } else if (o->type == REDIS_LIST) {
003f0840
PN
11181 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11182 listTypeEntry entry;
11183 while(listTypeNext(li,&entry)) {
11184 robj *eleobj = listTypeGet(&entry);
ba798261 11185 mixObjectDigest(digest,eleobj);
dc845730 11186 decrRefCount(eleobj);
ba798261 11187 }
003f0840 11188 listTypeReleaseIterator(li);
ba798261 11189 } else if (o->type == REDIS_SET) {
11190 dict *set = o->ptr;
11191 dictIterator *di = dictGetIterator(set);
11192 dictEntry *de;
11193
11194 while((de = dictNext(di)) != NULL) {
11195 robj *eleobj = dictGetEntryKey(de);
11196
11197 xorObjectDigest(digest,eleobj);
11198 }
11199 dictReleaseIterator(di);
11200 } else if (o->type == REDIS_ZSET) {
11201 zset *zs = o->ptr;
11202 dictIterator *di = dictGetIterator(zs->dict);
11203 dictEntry *de;
11204
11205 while((de = dictNext(di)) != NULL) {
11206 robj *eleobj = dictGetEntryKey(de);
11207 double *score = dictGetEntryVal(de);
11208 unsigned char eledigest[20];
11209
11210 snprintf(buf,sizeof(buf),"%.17g",*score);
11211 memset(eledigest,0,20);
11212 mixObjectDigest(eledigest,eleobj);
11213 mixDigest(eledigest,buf,strlen(buf));
11214 xorDigest(digest,eledigest,20);
11215 }
11216 dictReleaseIterator(di);
11217 } else if (o->type == REDIS_HASH) {
d1578a33 11218 hashTypeIterator *hi;
ba798261 11219 robj *obj;
11220
d1578a33
PN
11221 hi = hashTypeInitIterator(o);
11222 while (hashTypeNext(hi) != REDIS_ERR) {
ba798261 11223 unsigned char eledigest[20];
11224
11225 memset(eledigest,0,20);
d1578a33 11226 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
ba798261 11227 mixObjectDigest(eledigest,obj);
11228 decrRefCount(obj);
d1578a33 11229 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
ba798261 11230 mixObjectDigest(eledigest,obj);
11231 decrRefCount(obj);
11232 xorDigest(digest,eledigest,20);
11233 }
d1578a33 11234 hashTypeReleaseIterator(hi);
ba798261 11235 } else {
11236 redisPanic("Unknown object type");
11237 }
ba798261 11238 /* If the key has an expire, add it to the mix */
11239 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11240 /* We can finally xor the key-val digest to the final digest */
11241 xorDigest(final,digest,20);
09241813 11242 decrRefCount(keyobj);
ba798261 11243 }
11244 dictReleaseIterator(di);
11245 }
11246}
11247
7f957c92 11248static void debugCommand(redisClient *c) {
11249 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11250 *((char*)-1) = 'x';
210e29f7 11251 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11252 if (rdbSave(server.dbfilename) != REDIS_OK) {
11253 addReply(c,shared.err);
11254 return;
11255 }
11256 emptyDb();
11257 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11258 addReply(c,shared.err);
11259 return;
11260 }
11261 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11262 addReply(c,shared.ok);
71c2b467 11263 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11264 emptyDb();
11265 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11266 addReply(c,shared.err);
11267 return;
11268 }
11269 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11270 addReply(c,shared.ok);
333298da 11271 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
09241813 11272 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11273 robj *val;
333298da 11274
11275 if (!de) {
11276 addReply(c,shared.nokeyerr);
11277 return;
11278 }
333298da 11279 val = dictGetEntryVal(de);
560db612 11280 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11281 val->storage == REDIS_VM_SWAPPING)) {
07efaf74 11282 char *strenc;
11283 char buf[128];
11284
11285 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11286 strenc = strencoding[val->encoding];
11287 } else {
11288 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11289 strenc = buf;
11290 }
ace06542 11291 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11292 "+Value at:%p refcount:%d "
07efaf74 11293 "encoding:%s serializedlength:%lld\r\n",
09241813 11294 (void*)val, val->refcount,
07efaf74 11295 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 11296 } else {
560db612 11297 vmpointer *vp = (vmpointer*) val;
ace06542 11298 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11299 "+Value swapped at: page %llu "
ace06542 11300 "using %llu pages\r\n",
09241813 11301 (unsigned long long) vp->page,
560db612 11302 (unsigned long long) vp->usedpages));
ace06542 11303 }
78ebe4c8 11304 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11305 lookupKeyRead(c->db,c->argv[2]);
11306 addReply(c,shared.ok);
7d30035d 11307 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
09241813 11308 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11309 robj *val;
560db612 11310 vmpointer *vp;
7d30035d 11311
11312 if (!server.vm_enabled) {
11313 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11314 return;
11315 }
11316 if (!de) {
11317 addReply(c,shared.nokeyerr);
11318 return;
11319 }
7d30035d 11320 val = dictGetEntryVal(de);
4ef8de8a 11321 /* Swap it */
560db612 11322 if (val->storage != REDIS_VM_MEMORY) {
7d30035d 11323 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
560db612 11324 } else if (val->refcount != 1) {
11325 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11326 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11327 dictGetEntryVal(de) = vp;
7d30035d 11328 addReply(c,shared.ok);
11329 } else {
11330 addReply(c,shared.err);
11331 }
59305dc7 11332 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11333 long keys, j;
11334 robj *key, *val;
11335 char buf[128];
11336
11337 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11338 return;
11339 for (j = 0; j < keys; j++) {
11340 snprintf(buf,sizeof(buf),"key:%lu",j);
11341 key = createStringObject(buf,strlen(buf));
11342 if (lookupKeyRead(c->db,key) != NULL) {
11343 decrRefCount(key);
11344 continue;
11345 }
11346 snprintf(buf,sizeof(buf),"value:%lu",j);
11347 val = createStringObject(buf,strlen(buf));
09241813 11348 dbAdd(c->db,key,val);
11349 decrRefCount(key);
59305dc7 11350 }
11351 addReply(c,shared.ok);
ba798261 11352 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11353 unsigned char digest[20];
11354 sds d = sdsnew("+");
11355 int j;
11356
11357 computeDatasetDigest(digest);
11358 for (j = 0; j < 20; j++)
11359 d = sdscatprintf(d, "%02x",digest[j]);
11360
11361 d = sdscatlen(d,"\r\n",2);
11362 addReplySds(c,d);
7f957c92 11363 } else {
333298da 11364 addReplySds(c,sdsnew(
bdcb92f2 11365 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 11366 }
11367}
56906eef 11368
6c96ba7d 11369static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 11370 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 11371 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 11372#ifdef HAVE_BACKTRACE
11373 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11374 *((char*)-1) = 'x';
11375#endif
11376}
11377
c651fd9e 11378static void _redisPanic(char *msg, char *file, int line) {
11379 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 11380 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 11381#ifdef HAVE_BACKTRACE
11382 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11383 *((char*)-1) = 'x';
11384#endif
11385}
11386
bcfc686d 11387/* =================================== Main! ================================ */
56906eef 11388
bcfc686d 11389#ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 *
 * Returns the number found at the start of the file, or -1 when the file
 * can't be opened, can't be read, or does not start with a number. The
 * latter case matters because the caller treats 0 as a meaningful setting:
 * unparsable content must not be reported as 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *line, *end;
    long value;

    if (!fp) return -1;
    line = fgets(buf,sizeof(buf),fp);
    fclose(fp);
    if (line == NULL) return -1;

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int) value;
}
11403
11404void linuxOvercommitMemoryWarning(void) {
11405 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 11406 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 11407 }
11408}
11409#endif /* __linux__ */
11410
11411static void daemonize(void) {
11412 int fd;
11413 FILE *fp;
11414
11415 if (fork() != 0) exit(0); /* parent exits */
11416 setsid(); /* create a new session */
11417
11418 /* Every output goes to /dev/null. If Redis is daemonized but
11419 * the 'logfile' is set to 'stdout' in the configuration file
11420 * it will not log at all. */
11421 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11422 dup2(fd, STDIN_FILENO);
11423 dup2(fd, STDOUT_FILENO);
11424 dup2(fd, STDERR_FILENO);
11425 if (fd > STDERR_FILENO) close(fd);
11426 }
11427 /* Try to write the pid file */
11428 fp = fopen(server.pidfile,"w");
11429 if (fp) {
11430 fprintf(fp,"%d\n",getpid());
11431 fclose(fp);
56906eef 11432 }
56906eef 11433}
11434
42ab0172 11435static void version() {
8a3b0d2d 11436 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11437 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11438 exit(0);
11439}
11440
723fb69b
AO
/* Write the command line synopsis to stderr and terminate the process
 * with exit code 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
11446
bcfc686d 11447int main(int argc, char **argv) {
9651a787 11448 time_t start;
11449
bcfc686d 11450 initServerConfig();
1a132bbc 11451 sortCommandTable();
bcfc686d 11452 if (argc == 2) {
44efe66e 11453 if (strcmp(argv[1], "-v") == 0 ||
11454 strcmp(argv[1], "--version") == 0) version();
11455 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11456 resetServerSaveParams();
11457 loadServerConfig(argv[1]);
723fb69b
AO
11458 } else if ((argc > 2)) {
11459 usage();
bcfc686d 11460 } else {
11461 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11462 }
bcfc686d 11463 if (server.daemonize) daemonize();
71c54b21 11464 initServer();
bcfc686d 11465 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11466#ifdef __linux__
11467 linuxOvercommitMemoryWarning();
11468#endif
9651a787 11469 start = time(NULL);
bcfc686d 11470 if (server.appendonly) {
11471 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11472 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11473 } else {
11474 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11475 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11476 }
bcfc686d 11477 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11478 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11479 aeMain(server.el);
11480 aeDeleteEventLoop(server.el);
11481 return 0;
11482}
11483
11484/* ============================= Backtrace support ========================= */
11485
11486#ifdef HAVE_BACKTRACE
11487static char *findFuncName(void *pointer, unsigned long *offset);
11488
56906eef 11489static void *getMcontextEip(ucontext_t *uc) {
11490#if defined(__FreeBSD__)
11491 return (void*) uc->uc_mcontext.mc_eip;
11492#elif defined(__dietlibc__)
11493 return (void*) uc->uc_mcontext.eip;
06db1f50 11494#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11495 #if __x86_64__
11496 return (void*) uc->uc_mcontext->__ss.__rip;
11497 #else
56906eef 11498 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11499 #endif
06db1f50 11500#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11501 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11502 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11503 #else
11504 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11505 #endif
54bac49d 11506#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11507 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11508#elif defined(__ia64__) /* Linux IA64 */
11509 return (void*) uc->uc_mcontext.sc_ip;
11510#else
11511 return NULL;
56906eef 11512#endif
11513}
11514
11515static void segvHandler(int sig, siginfo_t *info, void *secret) {
11516 void *trace[100];
11517 char **messages = NULL;
11518 int i, trace_size = 0;
11519 unsigned long offset=0;
56906eef 11520 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11521 sds infostring;
56906eef 11522 REDIS_NOTUSED(info);
11523
11524 redisLog(REDIS_WARNING,
11525 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11526 infostring = genRedisInfoString();
11527 redisLog(REDIS_WARNING, "%s",infostring);
11528 /* It's not safe to sdsfree() the returned string under memory
11529 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11530
56906eef 11531 trace_size = backtrace(trace, 100);
de96dbfe 11532 /* overwrite sigaction with caller's address */
b91cf5ef 11533 if (getMcontextEip(uc) != NULL) {
11534 trace[1] = getMcontextEip(uc);
11535 }
56906eef 11536 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11537
d76412d1 11538 for (i=1; i<trace_size; ++i) {
56906eef 11539 char *fn = findFuncName(trace[i], &offset), *p;
11540
11541 p = strchr(messages[i],'+');
11542 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11543 redisLog(REDIS_WARNING,"%s", messages[i]);
11544 } else {
11545 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11546 }
11547 }
b177fd30 11548 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11549 _exit(0);
fe3bbfbe 11550}
56906eef 11551
fab43727 11552static void sigtermHandler(int sig) {
11553 REDIS_NOTUSED(sig);
b58ba105 11554
fab43727 11555 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11556 server.shutdown_asap = 1;
b58ba105
AM
11557}
11558
56906eef 11559static void setupSigSegvAction(void) {
11560 struct sigaction act;
11561
11562 sigemptyset (&act.sa_mask);
11563 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11564 * is used. Otherwise, sa_handler is used */
11565 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11566 act.sa_sigaction = segvHandler;
11567 sigaction (SIGSEGV, &act, NULL);
11568 sigaction (SIGBUS, &act, NULL);
12fea928 11569 sigaction (SIGFPE, &act, NULL);
11570 sigaction (SIGILL, &act, NULL);
11571 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11572
11573 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11574 act.sa_handler = sigtermHandler;
b58ba105 11575 sigaction (SIGTERM, &act, NULL);
e65fdc78 11576 return;
56906eef 11577}
e65fdc78 11578
bcfc686d 11579#include "staticsymbols.h"
11580/* This function try to convert a pointer into a function name. It's used in
11581 * oreder to provide a backtrace under segmentation fault that's able to
11582 * display functions declared as static (otherwise the backtrace is useless). */
11583static char *findFuncName(void *pointer, unsigned long *offset){
11584 int i, ret = -1;
11585 unsigned long off, minoff = 0;
ed9b544e 11586
bcfc686d 11587 /* Try to match against the Symbol with the smallest offset */
11588 for (i=0; symsTable[i].pointer; i++) {
11589 unsigned long lp = (unsigned long) pointer;
0bc03378 11590
bcfc686d 11591 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11592 off=lp-symsTable[i].pointer;
11593 if (ret < 0 || off < minoff) {
11594 minoff=off;
11595 ret=i;
11596 }
11597 }
0bc03378 11598 }
bcfc686d 11599 if (ret == -1) return NULL;
11600 *offset = minoff;
11601 return symsTable[ret].name;
0bc03378 11602}
bcfc686d 11603#else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: no signal handler
 * is installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
bcfc686d 11606#endif /* HAVE_BACKTRACE */
0bc03378 11607
ed9b544e 11608
ed9b544e 11609
bcfc686d 11610/* The End */
11611
11612
ed9b544e 11613