]> git.saurik.com Git - redis.git/blame - redis.c
expand the dictionary of the target set to the right size when converting from intset
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
c7d9d662 78#include "ziplist.h" /* Compact list data structure */
d0b58d53 79#include "intset.h" /* Compact integer set structure */
ba798261 80#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 81#include "release.h" /* Release and/or git repository information */
ed9b544e 82
83/* Error codes */
84#define REDIS_OK 0
85#define REDIS_ERR -1
86
87/* Static server configuration */
88#define REDIS_SERVERPORT 6379 /* TCP port */
89#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 90#define REDIS_IOBUF_LEN 1024
ed9b544e 91#define REDIS_LOADBUF_LEN 1024
248ea310 92#define REDIS_STATIC_ARGS 8
ed9b544e 93#define REDIS_DEFAULT_DBNUM 16
94#define REDIS_CONFIGLINE_MAX 1024
95#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 97#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 98#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 99#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
100
101/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
102#define REDIS_WRITEV_THRESHOLD 3
103/* Max number of iovecs used for each writev call */
104#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 105
106/* Hash table parameters */
107#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 108
109/* Command flags */
3fd78bcd 110#define REDIS_CMD_BULK 1 /* Bulk write command */
111#define REDIS_CMD_INLINE 2 /* Inline command */
112/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short these commands are denied on low memory conditions. */
116#define REDIS_CMD_DENYOOM 4
4005fef1 117#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 118
119/* Object types */
120#define REDIS_STRING 0
121#define REDIS_LIST 1
122#define REDIS_SET 2
1812e024 123#define REDIS_ZSET 3
124#define REDIS_HASH 4
560db612 125#define REDIS_VMPOINTER 8
f78fd11b 126
5234952b 127/* Objects encoding. Some kinds of objects like Strings and Hashes can be
128 * internally represented in multiple ways. The 'encoding' field of the object
129 * is set to one of these values for this object. */
c7d9d662
PN
130#define REDIS_ENCODING_RAW 0 /* Raw representation */
131#define REDIS_ENCODING_INT 1 /* Encoded as integer */
132#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
133#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
134#define REDIS_ENCODING_LIST 4 /* Encoded as linked list */
135#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
d0b58d53 136#define REDIS_ENCODING_INTSET 6 /* Encoded as intset */
942a3961 137
/* Printable name of each REDIS_ENCODING_* value. The array is indexed by the
 * encoding number, so entry 0 is "raw" and entry 6 is "intset". */
static char *strencoding[] = {
    "raw",
    "int",
    "hashtable",
    "zipmap",
    "list",
    "ziplist",
    "intset"
};
141
f78fd11b 142/* Object types only used for dumping to disk */
bb32ede5 143#define REDIS_EXPIRETIME 253
ed9b544e 144#define REDIS_SELECTDB 254
145#define REDIS_EOF 255
146
f78fd11b 147/* Defines related to the dump file format. To store 32 bits lengths for short
148 * keys requires a lot of space, so we check the most significant 2 bits of
149 * the first byte to interpret the length:
150 *
151 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
152 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
153 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 154 * 11|000000 this means: specially encoded object will follow. The six bits
155 * number specify the kind of object that follows.
156 * See the REDIS_RDB_ENC_* defines.
f78fd11b 157 *
10c43610 158 * Lengths up to 63 are stored using a single byte, most DB keys, and many
159 * values, will fit inside. */
f78fd11b 160#define REDIS_RDB_6BITLEN 0
161#define REDIS_RDB_14BITLEN 1
162#define REDIS_RDB_32BITLEN 2
17be1a4a 163#define REDIS_RDB_ENCVAL 3
f78fd11b 164#define REDIS_RDB_LENERR UINT_MAX
165
a4d1ba9a 166/* When a length of a string object stored on disk has the first two bits
167 * set, the remaining six bits specify a special encoding for the object
168 * according to the following defines: */
169#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
170#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
171#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 172#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 173
75680a3c 174/* Virtual memory object->where field. */
175#define REDIS_VM_MEMORY 0 /* The object is on memory */
176#define REDIS_VM_SWAPPED 1 /* The object is on disk */
177#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
178#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
179
06224fec 180/* Virtual memory static configuration stuff.
181 * Check vmFindContiguousPages() to know more about these magic numbers. */
182#define REDIS_VM_MAX_NEAR_PAGES 65536
183#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 184#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 185#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 186/* The following is the *percentage* of completed I/O jobs to process when the
187 * handler is called. While Virtual Memory I/O operations are performed by
188 * threads, these operations must be processed by the main thread when completed
189 * in order to take effect. */
c953f24b 190#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 191
ed9b544e 192/* Client flags */
d5d55fc3 193#define REDIS_SLAVE 1 /* This client is a slave server */
194#define REDIS_MASTER 2 /* This client is a master server */
195#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
196#define REDIS_MULTI 8 /* This client is in a MULTI context */
197#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
198#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 199#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 200
40d224a9 201/* Slave replication state - slave side */
ed9b544e 202#define REDIS_REPL_NONE 0 /* No active replication */
203#define REDIS_REPL_CONNECT 1 /* Must connect to master */
204#define REDIS_REPL_CONNECTED 2 /* Connected to master */
205
40d224a9 206/* Slave replication state - from the point of view of master
207 * Note that in SEND_BULK and ONLINE state the slave receives new updates
208 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
209 * to start the next background saving in order to send updates to it. */
210#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
211#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
212#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
213#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
214
ed9b544e 215/* List related stuff */
216#define REDIS_HEAD 0
217#define REDIS_TAIL 1
218
219/* Sort operations */
220#define REDIS_SORT_GET 0
443c6409 221#define REDIS_SORT_ASC 1
222#define REDIS_SORT_DESC 2
ed9b544e 223#define REDIS_SORTKEY_MAX 1024
224
225/* Log levels */
226#define REDIS_DEBUG 0
f870935d 227#define REDIS_VERBOSE 1
228#define REDIS_NOTICE 2
229#define REDIS_WARNING 3
ed9b544e 230
231/* Anti-warning macro... */
232#define REDIS_NOTUSED(V) ((void) V)
233
6b47e12e 234#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
235#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 236
48f0308a 237/* Append only defines */
238#define APPENDFSYNC_NO 0
239#define APPENDFSYNC_ALWAYS 1
240#define APPENDFSYNC_EVERYSEC 2
241
d0686e07 242/* Zip structure related defaults */
cbba7dd7 243#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
244#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
d0686e07
PN
245#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
246#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
70ff3511 247#define REDIS_SET_MAX_INTSET_ENTRIES 4096
cbba7dd7 248
dfc5e96c 249/* We can print the stacktrace, so our assert is defined this way: */
/* Assertion that reports through _redisAssert(): when the condition is false
 * it passes the stringified expression plus the call site's file and line to
 * _redisAssert() and then calls _exit(1). The condition is evaluated exactly
 * once and the whole macro is a single void expression. */
#define redisAssert(_e) \
    ((void)((_e) || (_redisAssert(#_e,__FILE__,__LINE__),_exit(1),0)))
/* Calls _redisPanic() with the stringified argument and the call site's file
 * and line, then calls _exit(1).
 *
 * The expansion is wrapped in parentheses on purpose: as a bare comma
 * expression, `cond ? x : redisPanic(m)` would parse as
 * `(cond ? x : _redisPanic(...)), _exit(1)` and exit unconditionally. */
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
6c96ba7d 252static void _redisAssert(char *estr, char *file, int line);
c651fd9e 253static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 254
ed9b544e 255/*================================= Data types ============================== */
256
257/* A redis object, that is a type able to hold a string / list / set */
75680a3c 258
75680a3c 259/* The actual Redis Object. The four bit-fields below add up to 32 bits. */
ed9b544e 260typedef struct redisObject {
560db612 261 unsigned type:4; /* one of the "Object types" defines (REDIS_STRING, ...) */
262 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
263 unsigned encoding:4; /* one of the REDIS_ENCODING_* defines */
264 unsigned lru:22; /* lru time (relative to server.lruclock) */
ed9b544e 265 int refcount; /* NOTE(review): presumably managed by incrRefCount()/decrRefCount() -- confirm */
560db612 266 void *ptr; /* NOTE(review): presumably the payload, interpreted per type/encoding -- confirm */
75680a3c 267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead.
 * NOTE(review): no VM fields follow in this struct as shown here, so the
 * paragraph above looks stale -- confirm and remove if so. */
ed9b544e 271} robj;
272
560db612 273/* The VM pointer structure - identifies an object in the swap file.
274 *
275 * This object is stored in place of the value
276 * object in the main key->value hash table representing a database.
277 * Note that the first fields (type, storage) are the same as the redisObject
278 * structure so that vmPointer structures can be accessed even when cast
279 * as redisObject structures.
280 *
281 * This is useful as we don't know if a value object is or not on disk, but we
169dd6b7 282 * are always able to read obj->storage to check this. For vmPointer
560db612 283 * structures "type" is set to REDIS_VMPOINTER (even if without this field
284 * is still possible to check the kind of object from the value of 'storage').*/
285typedef struct vmPointer {
286 unsigned type:4;
287 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
288 unsigned notused:26;
289 unsigned int vtype; /* type of the object stored in the swap file */
290 off_t page; /* the page at which the object is stored on disk */
291 off_t usedpages; /* number of pages used on disk */
292} vmpointer;
293
/* Macro used to initialize a Redis object allocated on the stack as a raw
 * string object: refcount 1, type REDIS_STRING, encoding REDIS_ENCODING_RAW,
 * storage REDIS_VM_MEMORY, with 'ptr' set to _ptr. The 'lru' field is not
 * touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon, so that the
 * caller's own ';' completes the do { } while(0) statement and the macro
 * can be used as the body of an if/else without breaking the else branch.
 * _var is parenthesized so expressions such as *p can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
305
/* One database: the keyspace dict plus the per-database bookkeeping dicts. */
3305306f 306typedef struct redisDb {
4409877e 307 dict *dict; /* The keyspace for this DB */
308 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 309 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 310 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 311 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 312 int id; /* NOTE(review): presumably the index of this DB -- confirm */
313} redisDb;
314
6e469882 315/* Client MULTI/EXEC state */
/* One entry of multiState.commands: an argument vector, its length, and the
 * struct redisCommand it refers to. */
316typedef struct multiCmd {
317 robj **argv;
318 int argc;
319 struct redisCommand *cmd;
320} multiCmd;
321
/* Per-client array of multiCmd entries (stored as redisClient.mstate). */
322typedef struct multiState {
323 multiCmd *commands; /* Array of MULTI commands */
324 int count; /* Total number of MULTI commands */
325} multiState;
326
ed9b544e 327/* With multiplexing we need to take per-client state.
328 * Clients are taken in a linked list. */
329typedef struct redisClient {
330 int fd;
3305306f 331 redisDb *db;
ed9b544e 332 int dictid;
333 sds querybuf;
e8a74421 334 robj **argv, **mbargv;
335 int argc, mbargc;
40d224a9 336 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 337 int multibulk; /* multi bulk command format active */
ed9b544e 338 list *reply;
339 int sentlen;
340 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 341 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 342 int slaveseldb; /* slave selected db, if this client is a slave */
343 int authenticated; /* when requirepass is non-NULL */
344 int replstate; /* replication state if this is a slave */
345 int repldbfd; /* replication DB file descriptor */
6e469882 346 long repldboff; /* replication DB file offset */
40d224a9 347 off_t repldbsize; /* replication DB file size */
6e469882 348 multiState mstate; /* MULTI/EXEC state */
37ab76c9 349 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 350 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 351 int blocking_keys_num; /* Number of blocking keys */
4409877e 352 time_t blockingto; /* Blocking operation timeout. If UNIX current time
353 * is >= blockingto then the operation timed out. */
92f8e882 354 list *io_keys; /* Keys this client is waiting to be loaded from the
355 * swap file in order to continue. */
37ab76c9 356 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 357 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
358 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 359} redisClient;
360
/* One entry of the redisServer.saveparams array.
 * NOTE(review): presumably "save when at least `changes` changes happened
 * within `seconds` seconds" -- confirm against the code that reads it. */
361struct saveparam {
362 time_t seconds;
363 int changes;
364};
365
366/* Global server state structure */
367struct redisServer {
368 int port;
369 int fd;
3305306f 370 redisDb *db;
ed9b544e 371 long long dirty; /* changes to DB from the last save */
372 list *clients;
87eca727 373 list *slaves, *monitors;
ed9b544e 374 char neterr[ANET_ERR_LEN];
375 aeEventLoop *el;
376 int cronloops; /* number of times the cron function run */
377 list *objfreelist; /* A list of freed objects to avoid malloc() */
378 time_t lastsave; /* Unix time of last save succeeded */
ed9b544e 379 /* Fields used only for stats */
380 time_t stat_starttime; /* server start time */
381 long long stat_numcommands; /* number of processed commands */
382 long long stat_numconnections; /* number of connections received */
2a6a2ed1 383 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 384 /* Configuration */
385 int verbosity;
386 int glueoutputbuf;
387 int maxidletime;
388 int dbnum;
389 int daemonize;
44b38ef4 390 int appendonly;
48f0308a 391 int appendfsync;
38db9171 392 int no_appendfsync_on_rewrite;
fab43727 393 int shutdown_asap;
48f0308a 394 time_t lastfsync;
44b38ef4 395 int appendfd;
396 int appendseldb;
ed329fcf 397 char *pidfile;
9f3c422c 398 pid_t bgsavechildpid;
9d65a1bb 399 pid_t bgrewritechildpid;
400 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 401 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 402 struct saveparam *saveparams;
403 int saveparamslen;
404 char *logfile;
405 char *bindaddr;
406 char *dbfilename;
44b38ef4 407 char *appendfilename;
abcb223e 408 char *requirepass;
121f70cf 409 int rdbcompression;
8ca3e9d1 410 int activerehashing;
ed9b544e 411 /* Replication related */
412 int isslave;
d0ccebcf 413 char *masterauth;
ed9b544e 414 char *masterhost;
415 int masterport;
40d224a9 416 redisClient *master; /* client that is master for this slave */
ed9b544e 417 int replstate;
285add55 418 unsigned int maxclients;
4ef8de8a 419 unsigned long long maxmemory;
d5d55fc3 420 unsigned int blpop_blocked_clients;
421 unsigned int vm_blocked_clients;
ed9b544e 422 /* Sort parameters - qsort_r() is only available under BSD so we
423 * have to take this state global, in order to pass it to sortCompare() */
424 int sort_desc;
425 int sort_alpha;
426 int sort_bypattern;
75680a3c 427 /* Virtual memory configuration */
428 int vm_enabled;
054e426d 429 char *vm_swap_file;
75680a3c 430 off_t vm_page_size;
431 off_t vm_pages;
4ef8de8a 432 unsigned long long vm_max_memory;
d0686e07 433 /* Zip structure config */
cbba7dd7 434 size_t hash_max_zipmap_entries;
435 size_t hash_max_zipmap_value;
d0686e07
PN
436 size_t list_max_ziplist_entries;
437 size_t list_max_ziplist_value;
70ff3511 438 size_t set_max_intset_entries;
75680a3c 439 /* Virtual memory state */
440 FILE *vm_fp;
441 int vm_fd;
442 off_t vm_next_page; /* Next probably empty page */
443 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 444 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 445 time_t unixtime; /* Unix time sampled every second. */
92f8e882 446 /* Virtual memory I/O threads stuff */
92f8e882 447 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 448 * put the result of the operation in the io_done list. While the
449 * job is being processed, it's put on io_processing queue.
 * NOTE(review): the names io_jobs/io_done used here (and on io_mutex below)
 * do not match the fields declared below, which are io_newjobs,
 * io_processing and io_processed -- presumably older names; confirm. */
450 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
451 list *io_processing; /* List of VM I/O jobs being processed */
452 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 453 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 454 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 455 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
456 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 457 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 458 int io_active_threads; /* Number of running I/O threads */
459 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 460 /* Our main thread is blocked on the event loop, looking for sockets ready
461 * to be read or written, so when a threaded I/O operation is ready to be
462 * processed by the main thread, the I/O thread will use a unix pipe to
463 * awake the main thread. The followings are the two pipe FDs. */
464 int io_ready_pipe_read;
465 int io_ready_pipe_write;
7d98e08c 466 /* Virtual memory stats */
467 unsigned long long vm_stats_used_pages;
468 unsigned long long vm_stats_swapped_objects;
469 unsigned long long vm_stats_swapouts;
470 unsigned long long vm_stats_swapins;
befec3cd 471 /* Pubsub */
ffc6b7f8 472 dict *pubsub_channels; /* Map channels to list of subscribed clients */
473 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 474 /* Misc */
b9bc0eef 475 FILE *devnull;
560db612 476 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
477 unsigned lruclock_padding:10;
ed9b544e 478};
479
/* A pattern object paired with a client.
 * NOTE(review): presumably the element type of the pubsub_patterns lists in
 * redisClient and redisServer -- confirm. */
ffc6b7f8 480typedef struct pubsubPattern {
481 redisClient *client;
482 robj *pattern;
483} pubsubPattern;
484
ed9b544e 485typedef void redisCommandProc(redisClient *c);
ca1788b5 486typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* Description of one command: its name, the handler to call, and the hints
 * used to preload keys from VM before running it. */
ed9b544e 487struct redisCommand {
488 char *name;
489 redisCommandProc *proc; /* handler, called with the client as only argument */
490 int arity; /* NOTE(review): presumably the expected argument count -- confirm */
491 int flags; /* presumably an OR of the REDIS_CMD_* "Command flags" -- confirm */
76583ea4
PN
492 /* Use a function to determine which keys need to be loaded
493 * in the background prior to executing this command. Takes precedence
494 * over vm_firstkey and others, ignored when NULL */
ca1788b5 495 redisVmPreloadProc *vm_preload_proc;
7c775e09 496 /* What keys should be loaded in background when calling this command? */
497 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
498 int vm_lastkey; /* The last argument that's a key */
499 int vm_keystep; /* The step between first and last key */
ed9b544e 500};
501
/* A name paired with an address stored as an unsigned long.
 * NOTE(review): presumably a function symbol table entry used when printing
 * stack traces -- confirm against its users. */
de96dbfe 502struct redisFunctionSym {
503 char *name;
56906eef 504 unsigned long pointer;
de96dbfe 505};
506
/* An object plus a sort key held in a union: either a double score or
 * another object to compare by.
 * NOTE(review): which union member is valid is not recorded in this struct;
 * presumably decided by the SORT options -- confirm. */
ed9b544e 507typedef struct _redisSortObject {
508 robj *obj;
509 union {
510 double score;
511 robj *cmpobj;
512 } u;
513} redisSortObject;
514
/* A sort operation: a type code and the pattern object it applies to.
 * NOTE(review): 'type' presumably takes the "Sort operations" defines such as
 * REDIS_SORT_GET -- confirm. */
515typedef struct _redisSortOperation {
516 int type;
517 robj *pattern;
518} redisSortOperation;
519
6b47e12e 520/* ZSETs use a specialized version of Skiplists */
521
/* A skiplist node: a (score, obj) pair with forward links, one backward
 * link, and a 'span' array.
 * NOTE(review): 'forward' and 'span' are presumably per-level arrays sized
 * by the node's level -- confirm in the node creation code. */
522typedef struct zskiplistNode {
523 struct zskiplistNode **forward;
e3870fab 524 struct zskiplistNode *backward;
912b9165 525 unsigned int *span;
6b47e12e 526 double score;
527 robj *obj;
528} zskiplistNode;
529
/* The skiplist itself: header and tail node pointers, a length and a level.
 * NOTE(review): 'length' is presumably the element count and 'level' the
 * highest level currently in use (bounded by ZSKIPLIST_MAXLEVEL) -- confirm. */
530typedef struct zskiplist {
e3870fab 531 struct zskiplistNode *header, *tail;
d13f767c 532 unsigned long length;
6b47e12e 533 int level;
534} zskiplist;
535
/* Sorted set value: a dict together with a skiplist.
 * NOTE(review): presumably both index the same elements (dict for lookup by
 * member, skiplist for ordering by score) -- confirm. */
1812e024 536typedef struct zset {
537 dict *dict;
6b47e12e 538 zskiplist *zsl;
1812e024 539} zset;
540
6b47e12e 541/* Our shared "common" objects */
542
05df7621 543#define REDIS_SHARED_INTEGERS 10000
/* Single global instance 'shared' holding pointers to preallocated objects,
 * including an array of REDIS_SHARED_INTEGERS integer objects.
 * NOTE(review): presumably filled in once at startup and never freed --
 * confirm in the initialization code. */
ed9b544e 544struct sharedObjectsStruct {
c937aa89 545 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 546 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 547 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
548 *outofrangeerr, *plus,
ed9b544e 549 *select0, *select1, *select2, *select3, *select4,
befec3cd 550 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 551 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
552 *mbulk4, *psubscribebulk, *punsubscribebulk,
553 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 554} shared;
555
a7866db6 556/* Global vars that are actually used as constants. The following double
557 * values are used for double on-disk serialization, and are initialized
558 * at runtime to avoid strange compiler optimizations. */
559
560static double R_Zero, R_PosInf, R_NegInf, R_Nan;
561
92f8e882 562/* VM threaded I/O request message */
b9bc0eef 563#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
564#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
565#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 566typedef struct iojob {
996cb5f7 567 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 568 redisDb *db;/* Redis database */
92f8e882 569 robj *key; /* This I/O request is about swapping this key */
560db612 570 robj *id; /* Unique identifier of this job:
571 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
572 vmpointer object for REDIS_IOJOB_LOAD. */
b9bc0eef 573 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 574 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
575 off_t page; /* Swap page where to read/write the object */
248ea310 576 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 577 int canceled; /* True if this command was canceled by blocking side of VM */
578 pthread_t thread; /* ID of the thread processing this entry */
579} iojob;
92f8e882 580
ed9b544e 581/*================================ Prototypes =============================== */
582
583static void freeStringObject(robj *o);
584static void freeListObject(robj *o);
585static void freeSetObject(robj *o);
586static void decrRefCount(void *o);
587static robj *createObject(int type, void *ptr);
588static void freeClient(redisClient *c);
f78fd11b 589static int rdbLoad(char *filename);
ed9b544e 590static void addReply(redisClient *c, robj *obj);
591static void addReplySds(redisClient *c, sds s);
592static void incrRefCount(robj *o);
f78fd11b 593static int rdbSaveBackground(char *filename);
ed9b544e 594static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 595static robj *dupStringObject(robj *o);
248ea310 596static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 597static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 598static void flushAppendOnlyFile(void);
44b38ef4 599static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 600static int syncWithMaster(void);
05df7621 601static robj *tryObjectEncoding(robj *o);
9d65a1bb 602static robj *getDecodedObject(robj *o);
3305306f 603static int removeExpire(redisDb *db, robj *key);
604static int expireIfNeeded(redisDb *db, robj *key);
605static int deleteIfVolatile(redisDb *db, robj *key);
09241813 606static int dbDelete(redisDb *db, robj *key);
bb32ede5 607static time_t getExpire(redisDb *db, robj *key);
608static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 609static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 610static void freeMemoryIfNeeded(void);
de96dbfe 611static int processCommand(redisClient *c);
56906eef 612static void setupSigSegvAction(void);
a3b21203 613static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 614static void aofRemoveTempFile(pid_t childpid);
0ea663ea 615static size_t stringObjectLen(robj *o);
638e42ac 616static void processInputBuffer(redisClient *c);
6b47e12e 617static zskiplist *zslCreate(void);
fd8ccf44 618static void zslFree(zskiplist *zsl);
2b59cfdf 619static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 620static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 621static void initClientMultiState(redisClient *c);
622static void freeClientMultiState(redisClient *c);
623static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 624static void unblockClientWaitingData(redisClient *c);
4409877e 625static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 626static void vmInit(void);
a35ddf12 627static void vmMarkPagesFree(off_t page, off_t count);
560db612 628static robj *vmLoadObject(robj *o);
629static robj *vmPreviewObject(robj *o);
a69a0c9c 630static int vmSwapOneObjectBlocking(void);
631static int vmSwapOneObjectThreaded(void);
7e69548d 632static int vmCanSwapOut(void);
a5819310 633static int tryFreeOneObjectFromFreelist(void);
996cb5f7 634static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
635static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
636static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 637static void lockThreadedIO(void);
638static void unlockThreadedIO(void);
639static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
640static void freeIOJob(iojob *j);
641static void queueIOJob(iojob *j);
a5819310 642static int vmWriteObjectOnSwap(robj *o, off_t page);
643static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 644static void waitEmptyIOJobsQueue(void);
645static void vmReopenSwapFile(void);
970e10bb 646static int vmFreePage(off_t page);
ca1788b5 647static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 648static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 649static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 650static int dontWaitForSwappedKey(redisClient *c, robj *key);
651static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
652static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
653static struct redisCommand *lookupCommand(char *name);
654static void call(redisClient *c, struct redisCommand *cmd);
655static void resetClient(redisClient *c);
ada386b2 656static void convertToRealHash(robj *o);
003f0840 657static void listTypeConvert(robj *o, int enc);
d0b58d53 658static void setTypeConvert(robj *o, int enc);
ffc6b7f8 659static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
660static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
661static void freePubsubPattern(void *p);
662static int listMatchPubsubPattern(void *a, void *b);
663static int compareStringObjects(robj *a, robj *b);
bf028098 664static int equalStringObjects(robj *a, robj *b);
befec3cd 665static void usage();
8f63ddca 666static int rewriteAppendOnlyFileBackground(void);
560db612 667static vmpointer *vmSwapObjectBlocking(robj *val);
fab43727 668static int prepareForShutdown();
37ab76c9 669static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 670static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 671static void unwatchAllKeys(redisClient *c);
ed9b544e 672
abcb223e 673static void authCommand(redisClient *c);
ed9b544e 674static void pingCommand(redisClient *c);
675static void echoCommand(redisClient *c);
676static void setCommand(redisClient *c);
677static void setnxCommand(redisClient *c);
526d00a5 678static void setexCommand(redisClient *c);
ed9b544e 679static void getCommand(redisClient *c);
680static void delCommand(redisClient *c);
681static void existsCommand(redisClient *c);
682static void incrCommand(redisClient *c);
683static void decrCommand(redisClient *c);
684static void incrbyCommand(redisClient *c);
685static void decrbyCommand(redisClient *c);
686static void selectCommand(redisClient *c);
687static void randomkeyCommand(redisClient *c);
688static void keysCommand(redisClient *c);
689static void dbsizeCommand(redisClient *c);
690static void lastsaveCommand(redisClient *c);
691static void saveCommand(redisClient *c);
692static void bgsaveCommand(redisClient *c);
9d65a1bb 693static void bgrewriteaofCommand(redisClient *c);
ed9b544e 694static void shutdownCommand(redisClient *c);
695static void moveCommand(redisClient *c);
696static void renameCommand(redisClient *c);
697static void renamenxCommand(redisClient *c);
698static void lpushCommand(redisClient *c);
699static void rpushCommand(redisClient *c);
700static void lpopCommand(redisClient *c);
701static void rpopCommand(redisClient *c);
702static void llenCommand(redisClient *c);
703static void lindexCommand(redisClient *c);
704static void lrangeCommand(redisClient *c);
705static void ltrimCommand(redisClient *c);
706static void typeCommand(redisClient *c);
707static void lsetCommand(redisClient *c);
708static void saddCommand(redisClient *c);
709static void sremCommand(redisClient *c);
a4460ef4 710static void smoveCommand(redisClient *c);
ed9b544e 711static void sismemberCommand(redisClient *c);
712static void scardCommand(redisClient *c);
12fea928 713static void spopCommand(redisClient *c);
2abb95a9 714static void srandmemberCommand(redisClient *c);
ed9b544e 715static void sinterCommand(redisClient *c);
716static void sinterstoreCommand(redisClient *c);
40d224a9 717static void sunionCommand(redisClient *c);
718static void sunionstoreCommand(redisClient *c);
f4f56e1d 719static void sdiffCommand(redisClient *c);
720static void sdiffstoreCommand(redisClient *c);
ed9b544e 721static void syncCommand(redisClient *c);
722static void flushdbCommand(redisClient *c);
723static void flushallCommand(redisClient *c);
724static void sortCommand(redisClient *c);
725static void lremCommand(redisClient *c);
0f5f7e9a 726static void rpoplpushcommand(redisClient *c);
ed9b544e 727static void infoCommand(redisClient *c);
70003d28 728static void mgetCommand(redisClient *c);
87eca727 729static void monitorCommand(redisClient *c);
3305306f 730static void expireCommand(redisClient *c);
802e8373 731static void expireatCommand(redisClient *c);
f6b141c5 732static void getsetCommand(redisClient *c);
fd88489a 733static void ttlCommand(redisClient *c);
321b0e13 734static void slaveofCommand(redisClient *c);
7f957c92 735static void debugCommand(redisClient *c);
f6b141c5 736static void msetCommand(redisClient *c);
737static void msetnxCommand(redisClient *c);
fd8ccf44 738static void zaddCommand(redisClient *c);
7db723ad 739static void zincrbyCommand(redisClient *c);
cc812361 740static void zrangeCommand(redisClient *c);
50c55df5 741static void zrangebyscoreCommand(redisClient *c);
f44dd428 742static void zcountCommand(redisClient *c);
e3870fab 743static void zrevrangeCommand(redisClient *c);
3c41331e 744static void zcardCommand(redisClient *c);
1b7106e7 745static void zremCommand(redisClient *c);
6e333bbe 746static void zscoreCommand(redisClient *c);
1807985b 747static void zremrangebyscoreCommand(redisClient *c);
6e469882 748static void multiCommand(redisClient *c);
749static void execCommand(redisClient *c);
18b6cb76 750static void discardCommand(redisClient *c);
4409877e 751static void blpopCommand(redisClient *c);
752static void brpopCommand(redisClient *c);
4b00bebd 753static void appendCommand(redisClient *c);
39191553 754static void substrCommand(redisClient *c);
69d95c3e 755static void zrankCommand(redisClient *c);
798d9e55 756static void zrevrankCommand(redisClient *c);
978c2c94 757static void hsetCommand(redisClient *c);
1f1c7695 758static void hsetnxCommand(redisClient *c);
978c2c94 759static void hgetCommand(redisClient *c);
09aeb579
PN
760static void hmsetCommand(redisClient *c);
761static void hmgetCommand(redisClient *c);
07efaf74 762static void hdelCommand(redisClient *c);
92b27fe9 763static void hlenCommand(redisClient *c);
9212eafd 764static void zremrangebyrankCommand(redisClient *c);
5d373da9 765static void zunionstoreCommand(redisClient *c);
766static void zinterstoreCommand(redisClient *c);
78409a0f 767static void hkeysCommand(redisClient *c);
768static void hvalsCommand(redisClient *c);
769static void hgetallCommand(redisClient *c);
a86f14b1 770static void hexistsCommand(redisClient *c);
500ece7c 771static void configCommand(redisClient *c);
01426b05 772static void hincrbyCommand(redisClient *c);
befec3cd 773static void subscribeCommand(redisClient *c);
774static void unsubscribeCommand(redisClient *c);
ffc6b7f8 775static void psubscribeCommand(redisClient *c);
776static void punsubscribeCommand(redisClient *c);
befec3cd 777static void publishCommand(redisClient *c);
37ab76c9 778static void watchCommand(redisClient *c);
779static void unwatchCommand(redisClient *c);
f6b141c5 780
ed9b544e 781/*================================= Globals ================================= */
782
783/* Global vars */
784static struct redisServer server; /* server global state */
1a132bbc 785static struct redisCommand *commandTable;
1a132bbc 786static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
787 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
789 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 790 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
791 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
798 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
810 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
811 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
814 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
819 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
820 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
821 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
822 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
823 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
824 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 830 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
831 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
832 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
838 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
839 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 841 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 842 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 843 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 844 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 845 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
846 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
847 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 851 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
852 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
854 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
856 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
857 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
860 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
861 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
869 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
875 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 876 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
877 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
882 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
883 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
885 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 887 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 888 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
889 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 890 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
891 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 892 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 893 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 894 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 895};
bcfc686d 896
ed9b544e 897/*============================ Utility functions ============================ */
898
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Both lengths must be non-negative.
 * Neither buffer has to be nul terminated: bytes at or past the given
 * lengths are never read.
 *
 * Supported syntax:
 *   *       any sequence of bytes, the empty one included
 *   ?       exactly one byte
 *   [abc]   one byte among the listed ones
 *   [^abc]  one byte that is not among the listed ones
 *   [a-z]   one byte inside the inclusive range (bounds can be swapped)
 *   \x      the byte 'x' taken verbatim
 *
 * An unterminated '[' class is accepted and extends up to the end of the
 * pattern. If 'nocase' is non zero, comparisons are case insensitive
 * (with the exception of backslash-escaped bytes inside a class).
 *
 * Returns 1 if the string matches the pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* A run of '*' is equivalent to a single one. Check the length
             * before peeking at the next pattern byte. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back on the last byte so
                     * that the common advance below lands exactly at the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal byte can't match an exhausted string. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' can still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1021
/* Glob-style matching for nul terminated C strings: measures both
 * arguments with strlen() and hands them to stringmatchlen().
 * Returns whatever stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
1025
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit (case insensitive):
 *
 *   (none), b    bytes
 *   k            1000 bytes          kb    1024 bytes
 *   m            1000*1000 bytes     mb    1024*1024 bytes
 *   g            1000*1000*1000      gb    1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error, otherwise it's
 * set to 0. With an unknown unit the numeric part is still returned, not
 * scaled. If the numeric part does not fit the internal buffer, LLONG_MAX
 * is returned. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* The <ctype.h> functions want an unsigned char value: a plain char
     * with the high bit set would be passed as a negative int. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the (possibly signed) numeric prefix so that strtoll() sees a
     * nul terminated string without the unit. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    return val*mul;
}
1072
/* Convert a long long into a string.
 *
 * At most 'len' bytes are written into 's', nul term included: if the
 * buffer is too small the representation is truncated on the right.
 * Returns the number of characters stored, not counting the nul term
 * (0 if 'len' is 0, in which case 's' is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value using unsigned arithmetic: negating
     * LLONG_MIN as a signed value overflows. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits backward, from the least significant one. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1096
56906eef 1097static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1098 va_list ap;
1099 FILE *fp;
1100
1101 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1102 if (!fp) return;
1103
1104 va_start(ap, fmt);
1105 if (level >= server.verbosity) {
6766f45e 1106 char *c = ".-*#";
1904ecc1 1107 char buf[64];
1108 time_t now;
1109
1110 now = time(NULL);
6c9385e0 1111 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1112 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1113 vfprintf(fp, fmt, ap);
1114 fprintf(fp,"\n");
1115 fflush(fp);
1116 }
1117 va_end(ap);
1118
1119 if (server.logfile) fclose(fp);
1120}
1121
1122/*====================== Hash table type implementation ==================== */
1123
1124/* This is a hash table type that uses the SDS dynamic strings library as
1125 * keys and redis objects as values (objects can hold SDS strings,
1126 * lists, sets). */
1127
/* Destructor callback releasing 'val' with zfree().
 * The dict private data is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1133
4409877e 1134static void dictListDestructor(void *privdata, void *val)
1135{
1136 DICT_NOTUSED(privdata);
1137 listRelease((list*)val);
1138}
1139
09241813 1140static int dictSdsKeyCompare(void *privdata, const void *key1,
ed9b544e 1141 const void *key2)
1142{
1143 int l1,l2;
1144 DICT_NOTUSED(privdata);
1145
1146 l1 = sdslen((sds)key1);
1147 l2 = sdslen((sds)key2);
1148 if (l1 != l2) return 0;
1149 return memcmp(key1, key2, l1) == 0;
1150}
1151
/* Destructor callback for entries that are Redis objects: drops one
 * reference from 'val' via decrRefCount(). NULL is accepted and ignored.
 * The dict private data is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL */
    if (val != NULL) decrRefCount(val);
}
1159
/* Destructor callback for entries that are sds strings: 'val' is
 * released with sdsfree(). The dict private data is not used. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    sdsfree(val);
}
1166
942a3961 1167static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1168 const void *key2)
1169{
1170 const robj *o1 = key1, *o2 = key2;
09241813 1171 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
ed9b544e 1172}
1173
942a3961 1174static unsigned int dictObjHash(const void *key) {
ed9b544e 1175 const robj *o = key;
1176 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177}
1178
/* Hash callback for sds keys: hashes the sdslen() bytes of the string. */
static unsigned int dictSdsHash(const void *key) {
    const unsigned char *bytes = (unsigned char*)key;

    return dictGenHashFunction(bytes, sdslen((char*)key));
}
1182
942a3961 1183static int dictEncObjKeyCompare(void *privdata, const void *key1,
1184 const void *key2)
1185{
9d65a1bb 1186 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1187 int cmp;
942a3961 1188
2a1198b4 1189 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1190 o2->encoding == REDIS_ENCODING_INT)
1191 return o1->ptr == o2->ptr;
2a1198b4 1192
9d65a1bb 1193 o1 = getDecodedObject(o1);
1194 o2 = getDecodedObject(o2);
09241813 1195 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
9d65a1bb 1196 decrRefCount(o1);
1197 decrRefCount(o2);
1198 return cmp;
942a3961 1199}
1200
1201static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1202 robj *o = (robj*) key;
942a3961 1203
ed9e4966 1204 if (o->encoding == REDIS_ENCODING_RAW) {
1205 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1206 } else {
1207 if (o->encoding == REDIS_ENCODING_INT) {
1208 char buf[32];
1209 int len;
1210
ee14da56 1211 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1212 return dictGenHashFunction((unsigned char*)buf, len);
1213 } else {
1214 unsigned int hash;
1215
1216 o = getDecodedObject(o);
1217 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1218 decrRefCount(o);
1219 return hash;
1220 }
1221 }
942a3961 1222}
1223
09241813 1224/* Sets type */
ed9b544e 1225static dictType setDictType = {
942a3961 1226 dictEncObjHash, /* hash function */
ed9b544e 1227 NULL, /* key dup */
1228 NULL, /* val dup */
942a3961 1229 dictEncObjKeyCompare, /* key compare */
ed9b544e 1230 dictRedisObjectDestructor, /* key destructor */
1231 NULL /* val destructor */
1232};
1233
f2d9f50f 1234/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1235static dictType zsetDictType = {
1236 dictEncObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictEncObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
da0a1620 1241 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1242};
1243
09241813 1244/* Db->dict, keys are sds strings, vals are Redis objects. */
5234952b 1245static dictType dbDictType = {
09241813 1246 dictSdsHash, /* hash function */
ed9b544e 1247 NULL, /* key dup */
1248 NULL, /* val dup */
09241813 1249 dictSdsKeyCompare, /* key compare */
1250 dictSdsDestructor, /* key destructor */
ed9b544e 1251 dictRedisObjectDestructor /* val destructor */
1252};
1253
f2d9f50f 1254/* Db->expires */
1255static dictType keyptrDictType = {
09241813 1256 dictSdsHash, /* hash function */
f2d9f50f 1257 NULL, /* key dup */
1258 NULL, /* val dup */
09241813 1259 dictSdsKeyCompare, /* key compare */
1260 dictSdsDestructor, /* key destructor */
f2d9f50f 1261 NULL /* val destructor */
1262};
1263
5234952b 1264/* Hash type hash table (note that small hashes are represented with zipmaps) */
1265static dictType hashDictType = {
1266 dictEncObjHash, /* hash function */
1267 NULL, /* key dup */
1268 NULL, /* val dup */
1269 dictEncObjKeyCompare, /* key compare */
1270 dictRedisObjectDestructor, /* key destructor */
1271 dictRedisObjectDestructor /* val destructor */
1272};
1273
4409877e 1274/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1275 * lists as values. It's used for blocking operations (BLPOP) and to
1276 * map swapped keys to a list of clients waiting for these keys to be loaded. */
4409877e 1277static dictType keylistDictType = {
1278 dictObjHash, /* hash function */
1279 NULL, /* key dup */
1280 NULL, /* val dup */
1281 dictObjKeyCompare, /* key compare */
1282 dictRedisObjectDestructor, /* key destructor */
1283 dictListDestructor /* val destructor */
1284};
1285
42ab0172
AO
1286static void version();
1287
ed9b544e 1288/* ========================= Random utility functions ======================= */
1289
1290/* Redis generally does not try to recover from out of memory conditions
1291 * when allocating objects or strings, it is not clear if it will be possible
1292 * to report this condition to the client since the networking layer itself
1293 * is based on heap allocation for send buffers, so we simply abort.
1294 * At least the code will be simpler to read... */
1295static void oom(const char *msg) {
71c54b21 1296 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1297 sleep(1);
1298 abort();
1299}
1300
1301/* ====================== Redis server networking stuff ===================== */
56906eef 1302static void closeTimedoutClients(void) {
ed9b544e 1303 redisClient *c;
ed9b544e 1304 listNode *ln;
1305 time_t now = time(NULL);
c7df85a4 1306 listIter li;
ed9b544e 1307
c7df85a4 1308 listRewind(server.clients,&li);
1309 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1310 c = listNodeValue(ln);
f86a74e9 1311 if (server.maxidletime &&
1312 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1313 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1314 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1315 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1316 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1317 {
f870935d 1318 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1319 freeClient(c);
f86a74e9 1320 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1321 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1322 addReply(c,shared.nullmultibulk);
b0d8747d 1323 unblockClientWaitingData(c);
f86a74e9 1324 }
ed9b544e 1325 }
1326 }
ed9b544e 1327}
1328
12fea928 1329static int htNeedsResize(dict *dict) {
1330 long long size, used;
1331
1332 size = dictSlots(dict);
1333 used = dictSize(dict);
1334 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1335 (used*100/size < REDIS_HT_MINFILL));
1336}
1337
0bc03378 1338/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1339 * we resize the hash table to save memory */
56906eef 1340static void tryResizeHashTables(void) {
0bc03378 1341 int j;
1342
1343 for (j = 0; j < server.dbnum; j++) {
5413c40d 1344 if (htNeedsResize(server.db[j].dict))
0bc03378 1345 dictResize(server.db[j].dict);
12fea928 1346 if (htNeedsResize(server.db[j].expires))
1347 dictResize(server.db[j].expires);
0bc03378 1348 }
1349}
1350
8ca3e9d1 1351/* Our hash table implementation performs rehashing incrementally while
1352 * we write/read from the hash table. Still if the server is idle, the hash
1353 * table will use two tables for a long time. So we try to use 1 millisecond
1354 * of CPU time at every serverCron() loop in order to rehash some key. */
1355static void incrementallyRehash(void) {
1356 int j;
1357
1358 for (j = 0; j < server.dbnum; j++) {
1359 if (dictIsRehashing(server.db[j].dict)) {
1360 dictRehashMilliseconds(server.db[j].dict,1);
1361 break; /* already used our millisecond for this loop... */
1362 }
1363 }
1364}
1365
9d65a1bb 1366/* A background saving child (BGSAVE) terminated its work. Handle this. */
1367void backgroundSaveDoneHandler(int statloc) {
1368 int exitcode = WEXITSTATUS(statloc);
1369 int bysignal = WIFSIGNALED(statloc);
1370
1371 if (!bysignal && exitcode == 0) {
1372 redisLog(REDIS_NOTICE,
1373 "Background saving terminated with success");
1374 server.dirty = 0;
1375 server.lastsave = time(NULL);
1376 } else if (!bysignal && exitcode != 0) {
1377 redisLog(REDIS_WARNING, "Background saving error");
1378 } else {
1379 redisLog(REDIS_WARNING,
454eea7c 1380 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1381 rdbRemoveTempFile(server.bgsavechildpid);
1382 }
1383 server.bgsavechildpid = -1;
1384 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1385 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1386 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1387}
1388
1389/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1390 * Handle this. */
1391void backgroundRewriteDoneHandler(int statloc) {
1392 int exitcode = WEXITSTATUS(statloc);
1393 int bysignal = WIFSIGNALED(statloc);
1394
1395 if (!bysignal && exitcode == 0) {
1396 int fd;
1397 char tmpfile[256];
1398
1399 redisLog(REDIS_NOTICE,
1400 "Background append only file rewriting terminated with success");
1401 /* Now it's time to flush the differences accumulated by the parent */
1402 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1403 fd = open(tmpfile,O_WRONLY|O_APPEND);
1404 if (fd == -1) {
1405 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1406 goto cleanup;
1407 }
1408 /* Flush our data... */
1409 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1410 (signed) sdslen(server.bgrewritebuf)) {
1411 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1412 close(fd);
1413 goto cleanup;
1414 }
b32627cd 1415 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1416 /* Now our work is to rename the temp file into the stable file. And
1417 * switch the file descriptor used by the server for append only. */
1418 if (rename(tmpfile,server.appendfilename) == -1) {
1419 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1420 close(fd);
1421 goto cleanup;
1422 }
1423 /* Mission completed... almost */
1424 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1425 if (server.appendfd != -1) {
1426 /* If append only is actually enabled... */
1427 close(server.appendfd);
1428 server.appendfd = fd;
d5d23dab 1429 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1430 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1431 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1432 } else {
1433 /* If append only is disabled we just generate a dump in this
1434 * format. Why not? */
1435 close(fd);
1436 }
1437 } else if (!bysignal && exitcode != 0) {
1438 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1439 } else {
1440 redisLog(REDIS_WARNING,
454eea7c 1441 "Background append only file rewriting terminated by signal %d",
1442 WTERMSIG(statloc));
9d65a1bb 1443 }
1444cleanup:
1445 sdsfree(server.bgrewritebuf);
1446 server.bgrewritebuf = sdsempty();
1447 aofRemoveTempFile(server.bgrewritechildpid);
1448 server.bgrewritechildpid = -1;
1449}
1450
884d4b39 1451/* This function is called once a background process of some kind terminates,
1452 * as we want to avoid resizing the hash tables when there is a child in order
1453 * to play well with copy-on-write (otherwise when a resize happens lots of
1454 * memory pages are copied). The goal of this function is to update the ability
1455 * for dict.c to resize the hash tables accordingly to the fact we have o not
1456 * running childs. */
1457static void updateDictResizePolicy(void) {
1458 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1459 dictEnableResize();
1460 else
1461 dictDisableResize();
1462}
1463
56906eef 1464static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1465 int j, loops = server.cronloops++;
ed9b544e 1466 REDIS_NOTUSED(eventLoop);
1467 REDIS_NOTUSED(id);
1468 REDIS_NOTUSED(clientData);
1469
3a66edc7 1470 /* We take a cached value of the unix time in the global state because
1471 * with virtual memory and aging there is to store the current time
1472 * in objects at every object access, and accuracy is not needed.
1473 * To access a global var is faster than calling time(NULL) */
1474 server.unixtime = time(NULL);
560db612 1475 /* We have just 21 bits per object for LRU information.
1476 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1477 *
1478 * When we need to select what object to swap, we compute the minimum
1479 * time distance between the current lruclock and the object last access
1480 * lruclock info. Even if clocks will wrap on overflow, there is
1481 * the interesting property that we are sure that at least
1482 * ABS(A-B) minutes passed between current time and timestamp B.
1483 *
1484 * This is not precise but we don't need at all precision, but just
1485 * something statistically reasonable.
1486 */
1487 server.lruclock = (time(NULL)/60)&((1<<21)-1);
3a66edc7 1488
fab43727 1489 /* We received a SIGTERM, shutting down here in a safe way, as it is
1490 * not ok doing so inside the signal handler. */
1491 if (server.shutdown_asap) {
1492 if (prepareForShutdown() == REDIS_OK) exit(0);
1493 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1494 }
1495
0bc03378 1496 /* Show some info about non-empty databases */
ed9b544e 1497 for (j = 0; j < server.dbnum; j++) {
dec423d9 1498 long long size, used, vkeys;
94754ccc 1499
3305306f 1500 size = dictSlots(server.db[j].dict);
1501 used = dictSize(server.db[j].dict);
94754ccc 1502 vkeys = dictSize(server.db[j].expires);
1763929f 1503 if (!(loops % 50) && (used || vkeys)) {
f870935d 1504 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1505 /* dictPrintStats(server.dict); */
ed9b544e 1506 }
ed9b544e 1507 }
1508
0bc03378 1509 /* We don't want to resize the hash tables while a bacground saving
1510 * is in progress: the saving child is created using fork() that is
1511 * implemented with a copy-on-write semantic in most modern systems, so
1512 * if we resize the HT while there is the saving child at work actually
1513 * a lot of memory movements in the parent will cause a lot of pages
1514 * copied. */
8ca3e9d1 1515 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1516 if (!(loops % 10)) tryResizeHashTables();
1517 if (server.activerehashing) incrementallyRehash();
884d4b39 1518 }
0bc03378 1519
ed9b544e 1520 /* Show information about connected clients */
1763929f 1521 if (!(loops % 50)) {
bdcb92f2 1522 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1523 listLength(server.clients)-listLength(server.slaves),
1524 listLength(server.slaves),
bdcb92f2 1525 zmalloc_used_memory());
ed9b544e 1526 }
1527
1528 /* Close connections of timedout clients */
1763929f 1529 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1530 closeTimedoutClients();
1531
9d65a1bb 1532 /* Check if a background saving or AOF rewrite in progress terminated */
1533 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1534 int statloc;
9d65a1bb 1535 pid_t pid;
1536
1537 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1538 if (pid == server.bgsavechildpid) {
1539 backgroundSaveDoneHandler(statloc);
ed9b544e 1540 } else {
9d65a1bb 1541 backgroundRewriteDoneHandler(statloc);
ed9b544e 1542 }
884d4b39 1543 updateDictResizePolicy();
ed9b544e 1544 }
1545 } else {
1546 /* If there is not a background saving in progress check if
1547 * we have to save now */
1548 time_t now = time(NULL);
1549 for (j = 0; j < server.saveparamslen; j++) {
1550 struct saveparam *sp = server.saveparams+j;
1551
1552 if (server.dirty >= sp->changes &&
1553 now-server.lastsave > sp->seconds) {
1554 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1555 sp->changes, sp->seconds);
f78fd11b 1556 rdbSaveBackground(server.dbfilename);
ed9b544e 1557 break;
1558 }
1559 }
1560 }
94754ccc 1561
f2324293 1562 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1563 * will use few CPU cycles if there are few expiring keys, otherwise
1564 * it will get more aggressive to avoid that too much memory is used by
1565 * keys that can be removed from the keyspace. */
94754ccc 1566 for (j = 0; j < server.dbnum; j++) {
f2324293 1567 int expired;
94754ccc 1568 redisDb *db = server.db+j;
94754ccc 1569
f2324293 1570 /* Continue to expire if at the end of the cycle more than 25%
1571 * of the keys were expired. */
1572 do {
4ef8de8a 1573 long num = dictSize(db->expires);
94754ccc 1574 time_t now = time(NULL);
1575
f2324293 1576 expired = 0;
94754ccc 1577 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1578 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1579 while (num--) {
1580 dictEntry *de;
1581 time_t t;
1582
1583 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1584 t = (time_t) dictGetEntryVal(de);
1585 if (now > t) {
09241813 1586 sds key = dictGetEntryKey(de);
1587 robj *keyobj = createStringObject(key,sdslen(key));
1588
1589 dbDelete(db,keyobj);
1590 decrRefCount(keyobj);
f2324293 1591 expired++;
2a6a2ed1 1592 server.stat_expiredkeys++;
94754ccc 1593 }
1594 }
f2324293 1595 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1596 }
1597
4ef8de8a 1598 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1599 * is enbled. Try to free objects from the free list first. */
7e69548d 1600 if (vmCanSwapOut()) {
1601 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1602 server.vm_max_memory)
1603 {
72e9fd40 1604 int retval;
1605
a5819310 1606 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1607 retval = (server.vm_max_threads == 0) ?
1608 vmSwapOneObjectBlocking() :
1609 vmSwapOneObjectThreaded();
1763929f 1610 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1611 zmalloc_used_memory() >
1612 (server.vm_max_memory+server.vm_max_memory/10))
1613 {
1614 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1615 }
72e9fd40 1616 /* Note that when using threade I/O we free just one object,
1617 * because anyway when the I/O thread in charge to swap this
1618 * object out will finish, the handler of completed jobs
1619 * will try to swap more objects if we are still out of memory. */
1620 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1621 }
1622 }
1623
ed9b544e 1624 /* Check if we should connect to a MASTER */
1763929f 1625 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1626 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1627 if (syncWithMaster() == REDIS_OK) {
1628 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1629 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1630 }
1631 }
1763929f 1632 return 100;
ed9b544e 1633}
1634
d5d55fc3 1635/* This function gets called every time Redis is entering the
1636 * main loop of the event driven library, that is, before to sleep
1637 * for ready file descriptors. */
1638static void beforeSleep(struct aeEventLoop *eventLoop) {
1639 REDIS_NOTUSED(eventLoop);
1640
28ed1f33 1641 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1642 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1643 listIter li;
1644 listNode *ln;
1645
1646 listRewind(server.io_ready_clients,&li);
1647 while((ln = listNext(&li))) {
1648 redisClient *c = ln->value;
1649 struct redisCommand *cmd;
1650
1651 /* Resume the client. */
1652 listDelNode(server.io_ready_clients,ln);
1653 c->flags &= (~REDIS_IO_WAIT);
1654 server.vm_blocked_clients--;
1655 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1656 readQueryFromClient, c);
1657 cmd = lookupCommand(c->argv[0]->ptr);
1658 assert(cmd != NULL);
1659 call(c,cmd);
1660 resetClient(c);
1661 /* There may be more data to process in the input buffer. */
1662 if (c->querybuf && sdslen(c->querybuf) > 0)
1663 processInputBuffer(c);
1664 }
1665 }
28ed1f33 1666 /* Write the AOF buffer on disk */
1667 flushAppendOnlyFile();
d5d55fc3 1668}
1669
ed9b544e 1670static void createSharedObjects(void) {
05df7621 1671 int j;
1672
ed9b544e 1673 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1674 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1675 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1676 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1677 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1678 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1679 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1680 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1681 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1682 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1683 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1684 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1685 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1686 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR no such key\r\n"));
ed9b544e 1688 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR syntax error\r\n"));
c937aa89 1690 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR source and destination objects are the same\r\n"));
1692 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR index out of range\r\n"));
ed9b544e 1694 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1695 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1696 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1697 shared.select0 = createStringObject("select 0\r\n",10);
1698 shared.select1 = createStringObject("select 1\r\n",10);
1699 shared.select2 = createStringObject("select 2\r\n",10);
1700 shared.select3 = createStringObject("select 3\r\n",10);
1701 shared.select4 = createStringObject("select 4\r\n",10);
1702 shared.select5 = createStringObject("select 5\r\n",10);
1703 shared.select6 = createStringObject("select 6\r\n",10);
1704 shared.select7 = createStringObject("select 7\r\n",10);
1705 shared.select8 = createStringObject("select 8\r\n",10);
1706 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1707 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1708 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1709 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1710 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1711 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1712 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1713 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1714 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1715 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1716 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1717 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1718 }
ed9b544e 1719}
1720
1721static void appendServerSaveParams(time_t seconds, int changes) {
1722 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1723 server.saveparams[server.saveparamslen].seconds = seconds;
1724 server.saveparams[server.saveparamslen].changes = changes;
1725 server.saveparamslen++;
1726}
1727
bcfc686d 1728static void resetServerSaveParams() {
ed9b544e 1729 zfree(server.saveparams);
1730 server.saveparams = NULL;
1731 server.saveparamslen = 0;
1732}
1733
1734static void initServerConfig() {
1735 server.dbnum = REDIS_DEFAULT_DBNUM;
1736 server.port = REDIS_SERVERPORT;
f870935d 1737 server.verbosity = REDIS_VERBOSE;
ed9b544e 1738 server.maxidletime = REDIS_MAXIDLETIME;
1739 server.saveparams = NULL;
1740 server.logfile = NULL; /* NULL = log on standard output */
1741 server.bindaddr = NULL;
1742 server.glueoutputbuf = 1;
1743 server.daemonize = 0;
44b38ef4 1744 server.appendonly = 0;
1b677732 1745 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1746 server.no_appendfsync_on_rewrite = 0;
48f0308a 1747 server.lastfsync = time(NULL);
44b38ef4 1748 server.appendfd = -1;
1749 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1750 server.pidfile = zstrdup("/var/run/redis.pid");
1751 server.dbfilename = zstrdup("dump.rdb");
1752 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1753 server.requirepass = NULL;
b0553789 1754 server.rdbcompression = 1;
8ca3e9d1 1755 server.activerehashing = 1;
285add55 1756 server.maxclients = 0;
d5d55fc3 1757 server.blpop_blocked_clients = 0;
3fd78bcd 1758 server.maxmemory = 0;
75680a3c 1759 server.vm_enabled = 0;
054e426d 1760 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1761 server.vm_page_size = 256; /* 256 bytes per page */
1762 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1763 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1764 server.vm_max_threads = 4;
d5d55fc3 1765 server.vm_blocked_clients = 0;
cbba7dd7 1766 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1767 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
d0686e07
PN
1768 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1769 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
70ff3511 1770 server.set_max_intset_entries = REDIS_SET_MAX_INTSET_ENTRIES;
fab43727 1771 server.shutdown_asap = 0;
75680a3c 1772
bcfc686d 1773 resetServerSaveParams();
ed9b544e 1774
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1779 server.isslave = 0;
d0ccebcf 1780 server.masterauth = NULL;
ed9b544e 1781 server.masterhost = NULL;
1782 server.masterport = 6379;
1783 server.master = NULL;
1784 server.replstate = REDIS_REPL_NONE;
a7866db6 1785
1786 /* Double constants initialization */
1787 R_Zero = 0.0;
1788 R_PosInf = 1.0/R_Zero;
1789 R_NegInf = -1.0/R_Zero;
1790 R_Nan = R_Zero/R_Zero;
ed9b544e 1791}
1792
1793static void initServer() {
1794 int j;
1795
1796 signal(SIGHUP, SIG_IGN);
1797 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1798 setupSigSegvAction();
ed9b544e 1799
b9bc0eef 1800 server.devnull = fopen("/dev/null","w");
1801 if (server.devnull == NULL) {
1802 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1803 exit(1);
1804 }
ed9b544e 1805 server.clients = listCreate();
1806 server.slaves = listCreate();
87eca727 1807 server.monitors = listCreate();
ed9b544e 1808 server.objfreelist = listCreate();
1809 createSharedObjects();
1810 server.el = aeCreateEventLoop();
3305306f 1811 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1812 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1813 if (server.fd == -1) {
1814 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1815 exit(1);
1816 }
3305306f 1817 for (j = 0; j < server.dbnum; j++) {
5234952b 1818 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1819 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1820 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1821 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1822 if (server.vm_enabled)
1823 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1824 server.db[j].id = j;
1825 }
ffc6b7f8 1826 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1827 server.pubsub_patterns = listCreate();
1828 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1829 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1830 server.cronloops = 0;
9f3c422c 1831 server.bgsavechildpid = -1;
9d65a1bb 1832 server.bgrewritechildpid = -1;
1833 server.bgrewritebuf = sdsempty();
28ed1f33 1834 server.aofbuf = sdsempty();
ed9b544e 1835 server.lastsave = time(NULL);
1836 server.dirty = 0;
ed9b544e 1837 server.stat_numcommands = 0;
1838 server.stat_numconnections = 0;
2a6a2ed1 1839 server.stat_expiredkeys = 0;
ed9b544e 1840 server.stat_starttime = time(NULL);
3a66edc7 1841 server.unixtime = time(NULL);
d8f8b666 1842 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1843 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1844 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1845
1846 if (server.appendonly) {
3bb225d6 1847 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1848 if (server.appendfd == -1) {
1849 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1850 strerror(errno));
1851 exit(1);
1852 }
1853 }
75680a3c 1854
1855 if (server.vm_enabled) vmInit();
ed9b544e 1856}
1857
1858/* Empty the whole database */
ca37e9cd 1859static long long emptyDb() {
ed9b544e 1860 int j;
ca37e9cd 1861 long long removed = 0;
ed9b544e 1862
3305306f 1863 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1864 removed += dictSize(server.db[j].dict);
3305306f 1865 dictEmpty(server.db[j].dict);
1866 dictEmpty(server.db[j].expires);
1867 }
ca37e9cd 1868 return removed;
ed9b544e 1869}
1870
/* Translate a "yes"/"no" string, compared case insensitively, into an
 * integer: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1876
ed9b544e 1877/* I agree, this is a very rudimental way to load a configuration...
1878 will improve later if the config gets more complex */
1879static void loadServerConfig(char *filename) {
c9a111ac 1880 FILE *fp;
ed9b544e 1881 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1882 int linenum = 0;
1883 sds line = NULL;
c9a111ac 1884
1885 if (filename[0] == '-' && filename[1] == '\0')
1886 fp = stdin;
1887 else {
1888 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1889 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1890 exit(1);
1891 }
ed9b544e 1892 }
c9a111ac 1893
ed9b544e 1894 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1895 sds *argv;
1896 int argc, j;
1897
1898 linenum++;
1899 line = sdsnew(buf);
1900 line = sdstrim(line," \t\r\n");
1901
1902 /* Skip comments and blank lines*/
1903 if (line[0] == '#' || line[0] == '\0') {
1904 sdsfree(line);
1905 continue;
1906 }
1907
1908 /* Split into arguments */
1909 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1910 sdstolower(argv[0]);
1911
1912 /* Execute config directives */
bb0b03a3 1913 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1914 server.maxidletime = atoi(argv[1]);
0150db36 1915 if (server.maxidletime < 0) {
ed9b544e 1916 err = "Invalid timeout value"; goto loaderr;
1917 }
bb0b03a3 1918 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1919 server.port = atoi(argv[1]);
1920 if (server.port < 1 || server.port > 65535) {
1921 err = "Invalid port"; goto loaderr;
1922 }
bb0b03a3 1923 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1924 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1925 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1926 int seconds = atoi(argv[1]);
1927 int changes = atoi(argv[2]);
1928 if (seconds < 1 || changes < 0) {
1929 err = "Invalid save parameters"; goto loaderr;
1930 }
1931 appendServerSaveParams(seconds,changes);
bb0b03a3 1932 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1933 if (chdir(argv[1]) == -1) {
1934 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1935 argv[1], strerror(errno));
1936 exit(1);
1937 }
bb0b03a3 1938 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1939 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1940 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1941 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1942 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1943 else {
1944 err = "Invalid log level. Must be one of debug, notice, warning";
1945 goto loaderr;
1946 }
bb0b03a3 1947 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1948 FILE *logfp;
ed9b544e 1949
1950 server.logfile = zstrdup(argv[1]);
bb0b03a3 1951 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1952 zfree(server.logfile);
1953 server.logfile = NULL;
1954 }
1955 if (server.logfile) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
c9a111ac 1958 logfp = fopen(server.logfile,"a");
1959 if (logfp == NULL) {
ed9b544e 1960 err = sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno));
1962 goto loaderr;
1963 }
c9a111ac 1964 fclose(logfp);
ed9b544e 1965 }
bb0b03a3 1966 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1967 server.dbnum = atoi(argv[1]);
1968 if (server.dbnum < 1) {
1969 err = "Invalid number of databases"; goto loaderr;
1970 }
b3f83f12
JZ
1971 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1972 loadServerConfig(argv[1]);
285add55 1973 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1974 server.maxclients = atoi(argv[1]);
3fd78bcd 1975 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1976 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1977 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1978 server.masterhost = sdsnew(argv[1]);
1979 server.masterport = atoi(argv[2]);
1980 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1981 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1982 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1983 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1984 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
121f70cf 1987 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1988 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1989 err = "argument must be 'yes' or 'no'"; goto loaderr;
1990 }
1991 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1992 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
bb0b03a3 1995 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1996 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1997 err = "argument must be 'yes' or 'no'"; goto loaderr;
1998 }
44b38ef4 1999 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2000 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2001 err = "argument must be 'yes' or 'no'"; goto loaderr;
2002 }
f3b52411
PN
2003 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2004 zfree(server.appendfilename);
2005 server.appendfilename = zstrdup(argv[1]);
38db9171 2006 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2007 && argc == 2) {
2008 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2009 err = "argument must be 'yes' or 'no'"; goto loaderr;
2010 }
48f0308a 2011 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 2012 if (!strcasecmp(argv[1],"no")) {
48f0308a 2013 server.appendfsync = APPENDFSYNC_NO;
1766c6da 2014 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 2015 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 2016 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 2017 server.appendfsync = APPENDFSYNC_EVERYSEC;
2018 } else {
2019 err = "argument must be 'no', 'always' or 'everysec'";
2020 goto loaderr;
2021 }
bb0b03a3 2022 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 2023 server.requirepass = zstrdup(argv[1]);
bb0b03a3 2024 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 2025 zfree(server.pidfile);
054e426d 2026 server.pidfile = zstrdup(argv[1]);
bb0b03a3 2027 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 2028 zfree(server.dbfilename);
054e426d 2029 server.dbfilename = zstrdup(argv[1]);
75680a3c 2030 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2031 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2032 err = "argument must be 'yes' or 'no'"; goto loaderr;
2033 }
054e426d 2034 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 2035 zfree(server.vm_swap_file);
054e426d 2036 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 2037 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 2038 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 2039 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 2040 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 2041 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 2042 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 2043 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2044 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 2045 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 2046 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 2047 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 2048 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
d0686e07
PN
2049 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2050 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2051 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2052 server.list_max_ziplist_value = memtoll(argv[1], NULL);
70ff3511
PN
2053 } else if (!strcasecmp(argv[0],"set-max-intset-entries") && argc == 2){
2054 server.set_max_intset_entries = memtoll(argv[1], NULL);
ed9b544e 2055 } else {
2056 err = "Bad directive or wrong number of arguments"; goto loaderr;
2057 }
2058 for (j = 0; j < argc; j++)
2059 sdsfree(argv[j]);
2060 zfree(argv);
2061 sdsfree(line);
2062 }
c9a111ac 2063 if (fp != stdin) fclose(fp);
ed9b544e 2064 return;
2065
2066loaderr:
2067 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2068 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2069 fprintf(stderr, ">>> '%s'\n", line);
2070 fprintf(stderr, "%s\n", err);
2071 exit(1);
2072}
2073
2074static void freeClientArgv(redisClient *c) {
2075 int j;
2076
2077 for (j = 0; j < c->argc; j++)
2078 decrRefCount(c->argv[j]);
e8a74421 2079 for (j = 0; j < c->mbargc; j++)
2080 decrRefCount(c->mbargv[j]);
ed9b544e 2081 c->argc = 0;
e8a74421 2082 c->mbargc = 0;
ed9b544e 2083}
2084
/* Release a client and everything attached to it.
 *
 * In order: the query buffer is freed (and the client unblocked if it was
 * blocked), watched keys and pubsub subscriptions are dropped, the file
 * events are removed, the reply list and argument vectors are released, the
 * socket is closed, the client is removed from the lists it belongs to
 * (clients, io_ready_clients, slaves or monitors), the master link state is
 * reset if this client was the master, and finally the memory of the client
 * structure is freed. The order of the first steps matters, see the comment
 * at the beginning of the body. */
2085static void freeClient(redisClient *c) {
2086 listNode *ln;
2087
4409877e 2088 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2089 * call, we have to set querybuf to NULL *before* to call
2090 * unblockClientWaitingData() to avoid processInputBuffer() will get
2091 * called. Also it is important to remove the file events after
2092 * this, because this call adds the READABLE event. */
4409877e 2093 sdsfree(c->querybuf);
2094 c->querybuf = NULL;
2095 if (c->flags & REDIS_BLOCKED)
b0d8747d 2096 unblockClientWaitingData(c);
4409877e 2097
37ab76c9 2098 /* UNWATCH all the keys */
2099 unwatchAllKeys(c);
2100 listRelease(c->watched_keys);
ffc6b7f8 2101 /* Unsubscribe from all the pubsub channels */
2102 pubsubUnsubscribeAllChannels(c,0);
2103 pubsubUnsubscribeAllPatterns(c,0);
2104 dictRelease(c->pubsub_channels);
2105 listRelease(c->pubsub_patterns);
befec3cd 2106 /* Obvious cleanup */
ed9b544e 2107 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2108 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2109 listRelease(c->reply);
2110 freeClientArgv(c);
2111 close(c->fd);
92f8e882 2112 /* Remove from the list of clients */
ed9b544e 2113 ln = listSearchKey(server.clients,c);
dfc5e96c 2114 redisAssert(ln != NULL);
ed9b544e 2115 listDelNode(server.clients,ln);
37ab76c9 2116 /* Remove from the list of clients that are now ready to be restarted
2117 * after waiting for swapped keys */
d5d55fc3 2118 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2119 ln = listSearchKey(server.io_ready_clients,c);
2120 if (ln) {
2121 listDelNode(server.io_ready_clients,ln);
2122 server.vm_blocked_clients--;
2123 }
2124 }
37ab76c9 2125 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2126 while (server.vm_enabled && listLength(c->io_keys)) {
2127 ln = listFirst(c->io_keys);
2128 dontWaitForSwappedKey(c,ln->value);
92f8e882 2129 }
b3e3d0d7 2130 listRelease(c->io_keys);
befec3cd 2131 /* Master/slave cleanup */
ed9b544e 2132 if (c->flags & REDIS_SLAVE) {
/* Close the file descriptor of the dump being transferred, if any. */
6208b3a7 2133 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2134 close(c->repldbfd);
/* A client with the REDIS_MONITOR flag is looked up in the monitors list,
 * any other slave in the slaves list. */
87eca727 2135 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2136 ln = listSearchKey(l,c);
dfc5e96c 2137 redisAssert(ln != NULL);
87eca727 2138 listDelNode(l,ln);
ed9b544e 2139 }
/* Losing the master: forget it and go back to the "connect" state. */
2140 if (c->flags & REDIS_MASTER) {
2141 server.master = NULL;
2142 server.replstate = REDIS_REPL_CONNECT;
2143 }
befec3cd 2144 /* Release memory */
93ea3759 2145 zfree(c->argv);
e8a74421 2146 zfree(c->mbargv);
6e469882 2147 freeClientMultiState(c);
ed9b544e 2148 zfree(c);
2149}
2150
cc30e368 2151#define GLUEREPLY_UP_TO (1024)
ed9b544e 2152static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2153 int copylen = 0;
2154 char buf[GLUEREPLY_UP_TO];
6208b3a7 2155 listNode *ln;
c7df85a4 2156 listIter li;
ed9b544e 2157 robj *o;
2158
c7df85a4 2159 listRewind(c->reply,&li);
2160 while((ln = listNext(&li))) {
c28b42ac 2161 int objlen;
2162
ed9b544e 2163 o = ln->value;
c28b42ac 2164 objlen = sdslen(o->ptr);
2165 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2166 memcpy(buf+copylen,o->ptr,objlen);
2167 copylen += objlen;
ed9b544e 2168 listDelNode(c->reply,ln);
c28b42ac 2169 } else {
2170 if (copylen == 0) return;
2171 break;
ed9b544e 2172 }
ed9b544e 2173 }
c28b42ac 2174 /* Now the output buffer is empty, add the new single element */
2175 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2176 listAddNodeHead(c->reply,o);
ed9b544e 2177}
2178
/* Writable event handler: send the pending replies of a client.
 *
 * privdata is the redisClient; fd is the descriptor written to. el and mask
 * are unused here (they are only forwarded to the writev() variant).
 *
 * The objects queued in c->reply are written one at a time, c->sentlen
 * being the number of bytes of the head object already sent. Writing stops
 * when the list is empty, when write() does not make progress, or after
 * more than REDIS_MAX_WRITE_PER_EVENT bytes were written in this call.
 * On a write error different from EAGAIN the client is freed. When the
 * list becomes empty the writable event is removed. */
2179static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2180 redisClient *c = privdata;
2181 int nwritten = 0, totwritten = 0, objlen;
2182 robj *o;
2183 REDIS_NOTUSED(el);
2184 REDIS_NOTUSED(mask);
2185
2895e862 2186 /* Use writev() if we have enough buffers to send */
7ea870c0 2187 if (!server.glueoutputbuf &&
e0a62c7f 2188 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2189 !(c->flags & REDIS_MASTER))
2895e862 2190 {
2191 sendReplyToClientWritev(el, fd, privdata, mask);
2192 return;
2193 }
2895e862 2194
ed9b544e 2195 while(listLength(c->reply)) {
/* With glueoutputbuf enabled, try to merge the first objects of the list
 * before sending. */
c28b42ac 2196 if (server.glueoutputbuf && listLength(c->reply) > 1)
2197 glueReplyBuffersIfNeeded(c);
2198
ed9b544e 2199 o = listNodeValue(listFirst(c->reply));
2200 objlen = sdslen(o->ptr);
2201
/* Empty objects are just discarded. */
2202 if (objlen == 0) {
2203 listDelNode(c->reply,listFirst(c->reply));
2204 continue;
2205 }
2206
2207 if (c->flags & REDIS_MASTER) {
6f376729 2208 /* Don't reply to a master: account the object as fully sent
 * without writing anything. */
ed9b544e 2209 nwritten = objlen - c->sentlen;
2210 } else {
a4d1ba9a 2211 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2212 if (nwritten <= 0) break;
2213 }
2214 c->sentlen += nwritten;
2215 totwritten += nwritten;
2216 /* If we fully sent the object on head go to the next one */
2217 if (c->sentlen == objlen) {
2218 listDelNode(c->reply,listFirst(c->reply));
2219 c->sentlen = 0;
2220 }
6f376729 2221 /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT
12f9d551 2222 * bytes, in a single threaded server it's a good idea to serve
6f376729 2223 * other clients as well, even if a very large request comes from
2224 * super fast link that is always able to accept data (in real world
12f9d551 2225 * scenario think about 'KEYS *' against the loopback interface) */
6f376729 2226 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2227 }
/* nwritten is -1 only when the last write() failed: EAGAIN is not treated
 * as an error, anything else closes the client. */
2228 if (nwritten == -1) {
2229 if (errno == EAGAIN) {
2230 nwritten = 0;
2231 } else {
f870935d 2232 redisLog(REDIS_VERBOSE,
ed9b544e 2233 "Error writing to client: %s", strerror(errno));
2234 freeClient(c);
2235 return;
2236 }
2237 }
2238 if (totwritten > 0) c->lastinteraction = time(NULL);
/* Nothing left to send: stop listening for writable events. */
2239 if (listLength(c->reply) == 0) {
2240 c->sentlen = 0;
2241 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2242 }
2243}
2244
/* Variant of sendReplyToClient() that sends several reply objects with a
 * single writev() call.
 *
 * privdata is the redisClient and fd the descriptor written to; el and
 * mask are unused. At every iteration up to REDIS_WRITEV_IOVEC_COUNT
 * objects from the head of c->reply are collected in an iovec array,
 * stopping before the object that would exceed REDIS_MAX_WRITE_PER_EVENT
 * bytes for this call. After the write, fully sent objects are removed from
 * the list and c->sentlen records how much of the new head object was
 * already sent. On a write error different from EAGAIN the client is
 * freed. When the list becomes empty the writable event is removed. */
2895e862 2245static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2246{
2247 redisClient *c = privdata;
2248 int nwritten = 0, totwritten = 0, objlen, willwrite;
2249 robj *o;
2250 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2251 int offset, ion = 0;
2252 REDIS_NOTUSED(el);
2253 REDIS_NOTUSED(mask);
2254
2255 listNode *node;
2256 while (listLength(c->reply)) {
/* Only the first object of the list may have been partially sent already,
 * so the offset applies to it alone. */
2257 offset = c->sentlen;
2258 ion = 0;
2259 willwrite = 0;
2260
2261 /* fill-in the iov[] array */
2262 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2263 o = listNodeValue(node);
2264 objlen = sdslen(o->ptr);
2265
e0a62c7f 2266 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2267 break;
2268
2269 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2270 break; /* no more iovecs */
2271
2272 iov[ion].iov_base = ((char*)o->ptr) + offset;
2273 iov[ion].iov_len = objlen - offset;
2274 willwrite += objlen - offset;
2275 offset = 0; /* just for the first item */
2276 ion++;
2277 }
2278
/* Nothing was collected: stop here. */
2279 if(willwrite == 0)
2280 break;
2281
2282 /* write all collected blocks at once */
2283 if((nwritten = writev(fd, iov, ion)) < 0) {
2284 if (errno != EAGAIN) {
f870935d 2285 redisLog(REDIS_VERBOSE,
2895e862 2286 "Error writing to client: %s", strerror(errno));
2287 freeClient(c);
2288 return;
2289 }
2290 break;
2291 }
2292
2293 totwritten += nwritten;
2294 offset = c->sentlen;
2295
2296 /* remove written robjs from c->reply */
2297 while (nwritten && listLength(c->reply)) {
2298 o = listNodeValue(listFirst(c->reply));
2299 objlen = sdslen(o->ptr);
2300
2301 if(nwritten >= objlen - offset) {
2302 listDelNode(c->reply, listFirst(c->reply));
2303 nwritten -= objlen - offset;
2304 c->sentlen = 0;
2305 } else {
2306 /* partial write */
2307 c->sentlen += nwritten;
2308 break;
2309 }
2310 offset = 0;
2311 }
2312 }
2313
e0a62c7f 2314 if (totwritten > 0)
2895e862 2315 c->lastinteraction = time(NULL);
2316
/* Nothing left to send: stop listening for writable events. */
2317 if (listLength(c->reply) == 0) {
2318 c->sentlen = 0;
2319 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2320 }
2321}
2322
1a132bbc
PN
2323static int qsortRedisCommands(const void *r1, const void *r2) {
2324 return strcasecmp(
2325 ((struct redisCommand*)r1)->name,
2326 ((struct redisCommand*)r2)->name);
2327}
2328
2329static void sortCommandTable() {
1a132bbc
PN
2330 /* Copy and sort the read-only version of the command table */
2331 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2332 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2333 qsort(commandTable,
2334 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2335 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2336}
2337
ed9b544e 2338static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2339 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2340 return bsearch(
2341 &tmp,
2342 commandTable,
d55d5c5d 2343 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2344 sizeof(struct redisCommand),
2345 qsortRedisCommands);
ed9b544e 2346}
2347
2348/* resetClient prepare the client to process the next command */
2349static void resetClient(redisClient *c) {
2350 freeClientArgv(c);
2351 c->bulklen = -1;
e8a74421 2352 c->multibulk = 0;
ed9b544e 2353}
2354
6e469882 2355/* Call() is the core of Redis execution of a command */
2356static void call(redisClient *c, struct redisCommand *cmd) {
2357 long long dirty;
2358
2359 dirty = server.dirty;
2360 cmd->proc(c);
4005fef1 2361 dirty = server.dirty-dirty;
2362
2363 if (server.appendonly && dirty)
6e469882 2364 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2365 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2366 listLength(server.slaves))
248ea310 2367 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2368 if (listLength(server.monitors))
dd142b9c 2369 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2370 server.stat_numcommands++;
2371}
2372
ed9b544e 2373/* If this function gets called we already read a whole
2374 * command, argments are in the client argv/argc fields.
2375 * processCommand() execute the command or prepare the
2376 * server for a bulk read from the client.
2377 *
2378 * If 1 is returned the client is still alive and valid and
2379 * and other operations can be performed by the caller. Otherwise
2380 * if 0 is returned the client was destroied (i.e. after QUIT). */
2381static int processCommand(redisClient *c) {
2382 struct redisCommand *cmd;
ed9b544e 2383
3fd78bcd 2384 /* Free some memory if needed (maxmemory setting) */
2385 if (server.maxmemory) freeMemoryIfNeeded();
2386
e8a74421 2387 /* Handle the multi bulk command type. This is an alternative protocol
2388 * supported by Redis in order to receive commands that are composed of
2389 * multiple binary-safe "bulk" arguments. The latency of processing is
2390 * a bit higher but this allows things like multi-sets, so if this
2391 * protocol is used only for MSET and similar commands this is a big win. */
2392 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2393 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2394 if (c->multibulk <= 0) {
2395 resetClient(c);
2396 return 1;
2397 } else {
2398 decrRefCount(c->argv[c->argc-1]);
2399 c->argc--;
2400 return 1;
2401 }
2402 } else if (c->multibulk) {
2403 if (c->bulklen == -1) {
2404 if (((char*)c->argv[0]->ptr)[0] != '$') {
2405 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2406 resetClient(c);
2407 return 1;
2408 } else {
2409 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2410 decrRefCount(c->argv[0]);
2411 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2412 c->argc--;
2413 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2414 resetClient(c);
2415 return 1;
2416 }
2417 c->argc--;
2418 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2419 return 1;
2420 }
2421 } else {
2422 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2423 c->mbargv[c->mbargc] = c->argv[0];
2424 c->mbargc++;
2425 c->argc--;
2426 c->multibulk--;
2427 if (c->multibulk == 0) {
2428 robj **auxargv;
2429 int auxargc;
2430
2431 /* Here we need to swap the multi-bulk argc/argv with the
2432 * normal argc/argv of the client structure. */
2433 auxargv = c->argv;
2434 c->argv = c->mbargv;
2435 c->mbargv = auxargv;
2436
2437 auxargc = c->argc;
2438 c->argc = c->mbargc;
2439 c->mbargc = auxargc;
2440
2441 /* We need to set bulklen to something different than -1
2442 * in order for the code below to process the command without
2443 * to try to read the last argument of a bulk command as
2444 * a special argument. */
2445 c->bulklen = 0;
2446 /* continue below and process the command */
2447 } else {
2448 c->bulklen = -1;
2449 return 1;
2450 }
2451 }
2452 }
2453 /* -- end of multi bulk commands processing -- */
2454
ed9b544e 2455 /* The QUIT command is handled as a special case. Normal command
2456 * procs are unable to close the client connection safely */
bb0b03a3 2457 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2458 freeClient(c);
2459 return 0;
2460 }
d5d55fc3 2461
2462 /* Now lookup the command and check ASAP about trivial error conditions
2463 * such wrong arity, bad command name and so forth. */
ed9b544e 2464 cmd = lookupCommand(c->argv[0]->ptr);
2465 if (!cmd) {
2c14807b 2466 addReplySds(c,
2467 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2468 (char*)c->argv[0]->ptr));
ed9b544e 2469 resetClient(c);
2470 return 1;
2471 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2472 (c->argc < -cmd->arity)) {
454d4e43 2473 addReplySds(c,
2474 sdscatprintf(sdsempty(),
2475 "-ERR wrong number of arguments for '%s' command\r\n",
2476 cmd->name));
ed9b544e 2477 resetClient(c);
2478 return 1;
2479 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2480 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2481 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2482
2483 decrRefCount(c->argv[c->argc-1]);
2484 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2485 c->argc--;
2486 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490 c->argc--;
2491 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2492 /* It is possible that the bulk read is already in the
8d0490e7 2493 * buffer. Check this condition and handle it accordingly.
2494 * This is just a fast path, alternative to call processInputBuffer().
2495 * It's a good idea since the code is small and this condition
2496 * happens most of the times. */
ed9b544e 2497 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2498 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2499 c->argc++;
2500 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2501 } else {
d5d55fc3 2502 /* Otherwise return... there is to read the last argument
2503 * from the socket. */
ed9b544e 2504 return 1;
2505 }
2506 }
942a3961 2507 /* Let's try to encode the bulk object to save space. */
2508 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2509 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2510
e63943a4 2511 /* Check if the user is authenticated */
2512 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2513 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2514 resetClient(c);
2515 return 1;
2516 }
2517
b61a28fe 2518 /* Handle the maxmemory directive */
2519 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2520 zmalloc_used_memory() > server.maxmemory)
2521 {
2522 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2523 resetClient(c);
2524 return 1;
2525 }
2526
d6cc8867 2527 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2528 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2529 &&
ffc6b7f8 2530 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2531 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2532 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2533 resetClient(c);
2534 return 1;
2535 }
2536
ed9b544e 2537 /* Exec the command */
6531c94d 2538 if (c->flags & REDIS_MULTI &&
2539 cmd->proc != execCommand && cmd->proc != discardCommand &&
2540 cmd->proc != multiCommand && cmd->proc != watchCommand)
2541 {
6e469882 2542 queueMultiCommand(c,cmd);
2543 addReply(c,shared.queued);
2544 } else {
d5d55fc3 2545 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2546 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2547 call(c,cmd);
2548 }
ed9b544e 2549
2550 /* Prepare the client for the next command */
ed9b544e 2551 resetClient(c);
2552 return 1;
2553}
2554
248ea310 2555static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2556 listNode *ln;
c7df85a4 2557 listIter li;
ed9b544e 2558 int outc = 0, j;
93ea3759 2559 robj **outv;
248ea310 2560 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2561 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2562 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2563 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2564 robj *lenobj;
93ea3759 2565
2566 if (argc <= REDIS_STATIC_ARGS) {
2567 outv = static_outv;
2568 } else {
248ea310 2569 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2570 }
248ea310 2571
2572 lenobj = createObject(REDIS_STRING,
2573 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2574 lenobj->refcount = 0;
2575 outv[outc++] = lenobj;
ed9b544e 2576 for (j = 0; j < argc; j++) {
248ea310 2577 lenobj = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"$%lu\r\n",
2579 (unsigned long) stringObjectLen(argv[j])));
2580 lenobj->refcount = 0;
2581 outv[outc++] = lenobj;
ed9b544e 2582 outv[outc++] = argv[j];
248ea310 2583 outv[outc++] = shared.crlf;
ed9b544e 2584 }
ed9b544e 2585
40d224a9 2586 /* Increment all the refcounts at start and decrement at end in order to
2587 * be sure to free objects if there is no slave in a replication state
2588 * able to be feed with commands */
2589 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2590 listRewind(slaves,&li);
2591 while((ln = listNext(&li))) {
ed9b544e 2592 redisClient *slave = ln->value;
40d224a9 2593
2594 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2595 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2596
2597 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2598 if (slave->slaveseldb != dictid) {
2599 robj *selectcmd;
2600
2601 switch(dictid) {
2602 case 0: selectcmd = shared.select0; break;
2603 case 1: selectcmd = shared.select1; break;
2604 case 2: selectcmd = shared.select2; break;
2605 case 3: selectcmd = shared.select3; break;
2606 case 4: selectcmd = shared.select4; break;
2607 case 5: selectcmd = shared.select5; break;
2608 case 6: selectcmd = shared.select6; break;
2609 case 7: selectcmd = shared.select7; break;
2610 case 8: selectcmd = shared.select8; break;
2611 case 9: selectcmd = shared.select9; break;
2612 default:
2613 selectcmd = createObject(REDIS_STRING,
2614 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2615 selectcmd->refcount = 0;
2616 break;
2617 }
2618 addReply(slave,selectcmd);
2619 slave->slaveseldb = dictid;
2620 }
2621 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2622 }
40d224a9 2623 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2624 if (outv != static_outv) zfree(outv);
ed9b544e 2625}
2626
dd142b9c 2627static sds sdscatrepr(sds s, char *p, size_t len) {
2628 s = sdscatlen(s,"\"",1);
2629 while(len--) {
2630 switch(*p) {
2631 case '\\':
2632 case '"':
2633 s = sdscatprintf(s,"\\%c",*p);
2634 break;
2635 case '\n': s = sdscatlen(s,"\\n",1); break;
2636 case '\r': s = sdscatlen(s,"\\r",1); break;
2637 case '\t': s = sdscatlen(s,"\\t",1); break;
2638 case '\a': s = sdscatlen(s,"\\a",1); break;
2639 case '\b': s = sdscatlen(s,"\\b",1); break;
2640 default:
2641 if (isprint(*p))
2642 s = sdscatprintf(s,"%c",*p);
2643 else
2644 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2645 break;
2646 }
2647 p++;
2648 }
2649 return sdscatlen(s,"\"",1);
2650}
2651
2652static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2653 listNode *ln;
2654 listIter li;
2655 int j;
2656 sds cmdrepr = sdsnew("+");
2657 robj *cmdobj;
2658 struct timeval tv;
2659
2660 gettimeofday(&tv,NULL);
2661 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2662 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2663
2664 for (j = 0; j < argc; j++) {
2665 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2666 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2667 } else {
2668 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2669 sdslen(argv[j]->ptr));
2670 }
2671 if (j != argc-1)
2672 cmdrepr = sdscatlen(cmdrepr," ",1);
2673 }
2674 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2675 cmdobj = createObject(REDIS_STRING,cmdrepr);
2676
2677 listRewind(monitors,&li);
2678 while((ln = listNext(&li))) {
2679 redisClient *monitor = ln->value;
2680 addReply(monitor,cmdobj);
2681 }
2682 decrRefCount(cmdobj);
2683}
2684
638e42ac 2685static void processInputBuffer(redisClient *c) {
ed9b544e 2686again:
4409877e 2687 /* Before to process the input buffer, make sure the client is not
2688 * waitig for a blocking operation such as BLPOP. Note that the first
2689 * iteration the client is never blocked, otherwise the processInputBuffer
2690 * would not be called at all, but after the execution of the first commands
2691 * in the input buffer the client may be blocked, and the "goto again"
2692 * will try to reiterate. The following line will make it return asap. */
92f8e882 2693 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2694 if (c->bulklen == -1) {
2695 /* Read the first line of the query */
2696 char *p = strchr(c->querybuf,'\n');
2697 size_t querylen;
644fafa3 2698
ed9b544e 2699 if (p) {
2700 sds query, *argv;
2701 int argc, j;
e0a62c7f 2702
ed9b544e 2703 query = c->querybuf;
2704 c->querybuf = sdsempty();
2705 querylen = 1+(p-(query));
2706 if (sdslen(query) > querylen) {
2707 /* leave data after the first line of the query in the buffer */
2708 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2709 }
2710 *p = '\0'; /* remove "\n" */
2711 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2712 sdsupdatelen(query);
2713
2714 /* Now we can split the query in arguments */
ed9b544e 2715 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2716 sdsfree(query);
2717
2718 if (c->argv) zfree(c->argv);
2719 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2720
2721 for (j = 0; j < argc; j++) {
ed9b544e 2722 if (sdslen(argv[j])) {
2723 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2724 c->argc++;
2725 } else {
2726 sdsfree(argv[j]);
2727 }
2728 }
2729 zfree(argv);
7c49733c 2730 if (c->argc) {
2731 /* Execute the command. If the client is still valid
2732 * after processCommand() return and there is something
2733 * on the query buffer try to process the next command. */
2734 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2735 } else {
2736 /* Nothing to process, argc == 0. Just process the query
2737 * buffer if it's not empty or return to the caller */
2738 if (sdslen(c->querybuf)) goto again;
2739 }
ed9b544e 2740 return;
644fafa3 2741 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2742 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2743 freeClient(c);
2744 return;
2745 }
2746 } else {
2747 /* Bulk read handling. Note that if we are at this point
2748 the client already sent a command terminated with a newline,
2749 we are reading the bulk data that is actually the last
2750 argument of the command. */
2751 int qbl = sdslen(c->querybuf);
2752
2753 if (c->bulklen <= qbl) {
2754 /* Copy everything but the final CRLF as final argument */
2755 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2756 c->argc++;
2757 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2758 /* Process the command. If the client is still valid after
2759 * the processing and there is more data in the buffer
2760 * try to parse it. */
2761 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2762 return;
2763 }
2764 }
2765}
2766
638e42ac 2767static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2768 redisClient *c = (redisClient*) privdata;
2769 char buf[REDIS_IOBUF_LEN];
2770 int nread;
2771 REDIS_NOTUSED(el);
2772 REDIS_NOTUSED(mask);
2773
2774 nread = read(fd, buf, REDIS_IOBUF_LEN);
2775 if (nread == -1) {
2776 if (errno == EAGAIN) {
2777 nread = 0;
2778 } else {
f870935d 2779 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2780 freeClient(c);
2781 return;
2782 }
2783 } else if (nread == 0) {
f870935d 2784 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2785 freeClient(c);
2786 return;
2787 }
2788 if (nread) {
2789 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2790 c->lastinteraction = time(NULL);
2791 } else {
2792 return;
2793 }
168ac5c6 2794 processInputBuffer(c);
638e42ac 2795}
2796
ed9b544e 2797static int selectDb(redisClient *c, int id) {
2798 if (id < 0 || id >= server.dbnum)
2799 return REDIS_ERR;
3305306f 2800 c->db = &server.db[id];
ed9b544e 2801 return REDIS_OK;
2802}
2803
40d224a9 2804static void *dupClientReplyValue(void *o) {
2805 incrRefCount((robj*)o);
12d090d2 2806 return o;
40d224a9 2807}
2808
/* Match method for lists of string objects: returns whatever
 * equalStringObjects() reports for the two values. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2812
ed9b544e 2813static redisClient *createClient(int fd) {
2814 redisClient *c = zmalloc(sizeof(*c));
2815
2816 anetNonBlock(NULL,fd);
2817 anetTcpNoDelay(NULL,fd);
2818 if (!c) return NULL;
2819 selectDb(c,0);
2820 c->fd = fd;
2821 c->querybuf = sdsempty();
2822 c->argc = 0;
93ea3759 2823 c->argv = NULL;
ed9b544e 2824 c->bulklen = -1;
e8a74421 2825 c->multibulk = 0;
2826 c->mbargc = 0;
2827 c->mbargv = NULL;
ed9b544e 2828 c->sentlen = 0;
2829 c->flags = 0;
2830 c->lastinteraction = time(NULL);
abcb223e 2831 c->authenticated = 0;
40d224a9 2832 c->replstate = REDIS_REPL_NONE;
6b47e12e 2833 c->reply = listCreate();
ed9b544e 2834 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2835 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2836 c->blocking_keys = NULL;
2837 c->blocking_keys_num = 0;
92f8e882 2838 c->io_keys = listCreate();
87c68815 2839 c->watched_keys = listCreate();
92f8e882 2840 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2841 c->pubsub_channels = dictCreate(&setDictType,NULL);
2842 c->pubsub_patterns = listCreate();
2843 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2844 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2845 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2846 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2847 freeClient(c);
2848 return NULL;
2849 }
6b47e12e 2850 listAddNodeTail(server.clients,c);
6e469882 2851 initClientMultiState(c);
ed9b544e 2852 return c;
2853}
2854
2855static void addReply(redisClient *c, robj *obj) {
2856 if (listLength(c->reply) == 0 &&
6208b3a7 2857 (c->replstate == REDIS_REPL_NONE ||
2858 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2859 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2860 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2861
2862 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2863 obj = dupStringObject(obj);
2864 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2865 }
9d65a1bb 2866 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2867}
2868
2869static void addReplySds(redisClient *c, sds s) {
2870 robj *o = createObject(REDIS_STRING,s);
2871 addReply(c,o);
2872 decrRefCount(o);
2873}
2874
e2665397 2875static void addReplyDouble(redisClient *c, double d) {
2876 char buf[128];
2877
2878 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2879 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2880 (unsigned long) strlen(buf),buf));
e2665397 2881}
2882
aa7c2934
PN
2883static void addReplyLongLong(redisClient *c, long long ll) {
2884 char buf[128];
2885 size_t len;
2886
2887 if (ll == 0) {
2888 addReply(c,shared.czero);
2889 return;
2890 } else if (ll == 1) {
2891 addReply(c,shared.cone);
2892 return;
2893 }
482b672d 2894 buf[0] = ':';
2895 len = ll2string(buf+1,sizeof(buf)-1,ll);
2896 buf[len+1] = '\r';
2897 buf[len+2] = '\n';
2898 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2899}
2900
92b27fe9 2901static void addReplyUlong(redisClient *c, unsigned long ul) {
2902 char buf[128];
2903 size_t len;
2904
dd88747b 2905 if (ul == 0) {
2906 addReply(c,shared.czero);
2907 return;
2908 } else if (ul == 1) {
2909 addReply(c,shared.cone);
2910 return;
2911 }
92b27fe9 2912 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2913 addReplySds(c,sdsnewlen(buf,len));
2914}
2915
942a3961 2916static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2917 size_t len, intlen;
2918 char buf[128];
942a3961 2919
2920 if (obj->encoding == REDIS_ENCODING_RAW) {
2921 len = sdslen(obj->ptr);
2922 } else {
2923 long n = (long)obj->ptr;
2924
e054afda 2925 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2926 len = 1;
2927 if (n < 0) {
2928 len++;
2929 n = -n;
2930 }
2931 while((n = n/10) != 0) {
2932 len++;
2933 }
2934 }
482b672d 2935 buf[0] = '$';
2936 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2937 buf[intlen+1] = '\r';
2938 buf[intlen+2] = '\n';
2939 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2940}
2941
dd88747b 2942static void addReplyBulk(redisClient *c, robj *obj) {
2943 addReplyBulkLen(c,obj);
2944 addReply(c,obj);
2945 addReply(c,shared.crlf);
2946}
2947
09241813 2948static void addReplyBulkSds(redisClient *c, sds s) {
2949 robj *o = createStringObject(s, sdslen(s));
2950 addReplyBulk(c,o);
2951 decrRefCount(o);
2952}
2953
500ece7c 2954/* In the CONFIG command we need to add vanilla C string as bulk replies */
2955static void addReplyBulkCString(redisClient *c, char *s) {
2956 if (s == NULL) {
2957 addReply(c,shared.nullbulk);
2958 } else {
2959 robj *o = createStringObject(s,strlen(s));
2960 addReplyBulk(c,o);
2961 decrRefCount(o);
2962 }
2963}
2964
ed9b544e 2965static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2966 int cport, cfd;
2967 char cip[128];
285add55 2968 redisClient *c;
ed9b544e 2969 REDIS_NOTUSED(el);
2970 REDIS_NOTUSED(mask);
2971 REDIS_NOTUSED(privdata);
2972
2973 cfd = anetAccept(server.neterr, fd, cip, &cport);
2974 if (cfd == AE_ERR) {
f870935d 2975 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2976 return;
2977 }
f870935d 2978 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2979 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2980 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2981 close(cfd); /* May be already closed, just ingore errors */
2982 return;
2983 }
285add55 2984 /* If maxclient directive is set and this is one client more... close the
2985 * connection. Note that we create the client instead to check before
2986 * for this condition, since now the socket is already set in nonblocking
2987 * mode and we can send an error for free using the Kernel I/O */
2988 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2989 char *err = "-ERR max number of clients reached\r\n";
2990
2991 /* That's a best effort error message, don't check write errors */
fee803ba 2992 if (write(c->fd,err,strlen(err)) == -1) {
2993 /* Nothing to do, Just to avoid the warning... */
2994 }
285add55 2995 freeClient(c);
2996 return;
2997 }
ed9b544e 2998 server.stat_numconnections++;
2999}
3000
3001/* ======================= Redis objects implementation ===================== */
3002
3003static robj *createObject(int type, void *ptr) {
3004 robj *o;
3005
a5819310 3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3007 if (listLength(server.objfreelist)) {
3008 listNode *head = listFirst(server.objfreelist);
3009 o = listNodeValue(head);
3010 listDelNode(server.objfreelist,head);
a5819310 3011 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3012 } else {
560db612 3013 if (server.vm_enabled)
a5819310 3014 pthread_mutex_unlock(&server.obj_freelist_mutex);
560db612 3015 o = zmalloc(sizeof(*o));
ed9b544e 3016 }
ed9b544e 3017 o->type = type;
942a3961 3018 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 3019 o->ptr = ptr;
3020 o->refcount = 1;
3a66edc7 3021 if (server.vm_enabled) {
1064ef87 3022 /* Note that this code may run in the context of an I/O thread
560db612 3023 * and accessing server.lruclock in theory is an error
1064ef87 3024 * (no locks). But in practice this is safe, and even if we read
560db612 3025 * garbage Redis will not fail. */
3026 o->lru = server.lruclock;
3a66edc7 3027 o->storage = REDIS_VM_MEMORY;
3028 }
ed9b544e 3029 return o;
3030}
3031
3032static robj *createStringObject(char *ptr, size_t len) {
3033 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3034}
3035
3f973463
PN
3036static robj *createStringObjectFromLongLong(long long value) {
3037 robj *o;
3038 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3039 incrRefCount(shared.integers[value]);
3040 o = shared.integers[value];
3041 } else {
3f973463 3042 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 3043 o = createObject(REDIS_STRING, NULL);
3f973463
PN
3044 o->encoding = REDIS_ENCODING_INT;
3045 o->ptr = (void*)((long)value);
3046 } else {
ee14da56 3047 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
3048 }
3049 }
3050 return o;
3051}
3052
4ef8de8a 3053static robj *dupStringObject(robj *o) {
b9bc0eef 3054 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 3055 return createStringObject(o->ptr,sdslen(o->ptr));
3056}
3057
ed9b544e 3058static robj *createListObject(void) {
3059 list *l = listCreate();
1cd92e7f 3060 robj *o = createObject(REDIS_LIST,l);
ed9b544e 3061 listSetFreeMethod(l,decrRefCount);
1cd92e7f
PN
3062 o->encoding = REDIS_ENCODING_LIST;
3063 return o;
3064}
3065
3066static robj *createZiplistObject(void) {
3067 unsigned char *zl = ziplistNew();
3068 robj *o = createObject(REDIS_LIST,zl);
3069 o->encoding = REDIS_ENCODING_ZIPLIST;
3070 return o;
ed9b544e 3071}
3072
3073static robj *createSetObject(void) {
3074 dict *d = dictCreate(&setDictType,NULL);
35cabcb5
PN
3075 robj *o = createObject(REDIS_SET,d);
3076 o->encoding = REDIS_ENCODING_HT;
3077 return o;
ed9b544e 3078}
3079
d0b58d53
PN
3080static robj *createIntsetObject(void) {
3081 intset *is = intsetNew();
3082 robj *o = createObject(REDIS_SET,is);
3083 o->encoding = REDIS_ENCODING_INTSET;
3084 return o;
3085}
3086
5234952b 3087static robj *createHashObject(void) {
3088 /* All the Hashes start as zipmaps. Will be automatically converted
3089 * into hash tables if there are enough elements or big elements
3090 * inside. */
3091 unsigned char *zm = zipmapNew();
3092 robj *o = createObject(REDIS_HASH,zm);
3093 o->encoding = REDIS_ENCODING_ZIPMAP;
3094 return o;
3095}
3096
1812e024 3097static robj *createZsetObject(void) {
6b47e12e 3098 zset *zs = zmalloc(sizeof(*zs));
3099
3100 zs->dict = dictCreate(&zsetDictType,NULL);
3101 zs->zsl = zslCreate();
3102 return createObject(REDIS_ZSET,zs);
1812e024 3103}
3104
ed9b544e 3105static void freeStringObject(robj *o) {
942a3961 3106 if (o->encoding == REDIS_ENCODING_RAW) {
3107 sdsfree(o->ptr);
3108 }
ed9b544e 3109}
3110
3111static void freeListObject(robj *o) {
c7d9d662
PN
3112 switch (o->encoding) {
3113 case REDIS_ENCODING_LIST:
3114 listRelease((list*) o->ptr);
3115 break;
3116 case REDIS_ENCODING_ZIPLIST:
3117 zfree(o->ptr);
3118 break;
3119 default:
3120 redisPanic("Unknown list encoding type");
3121 }
ed9b544e 3122}
3123
3124static void freeSetObject(robj *o) {
d0b58d53
PN
3125 switch (o->encoding) {
3126 case REDIS_ENCODING_HT:
3127 dictRelease((dict*) o->ptr);
3128 break;
3129 case REDIS_ENCODING_INTSET:
3130 zfree(o->ptr);
3131 break;
3132 default:
3133 redisPanic("Unknown set encoding type");
3134 }
ed9b544e 3135}
3136
fd8ccf44 3137static void freeZsetObject(robj *o) {
3138 zset *zs = o->ptr;
3139
3140 dictRelease(zs->dict);
3141 zslFree(zs->zsl);
3142 zfree(zs);
3143}
3144
ed9b544e 3145static void freeHashObject(robj *o) {
cbba7dd7 3146 switch (o->encoding) {
3147 case REDIS_ENCODING_HT:
3148 dictRelease((dict*) o->ptr);
3149 break;
3150 case REDIS_ENCODING_ZIPMAP:
3151 zfree(o->ptr);
3152 break;
3153 default:
f83c6cb5 3154 redisPanic("Unknown hash encoding type");
cbba7dd7 3155 break;
3156 }
ed9b544e 3157}
3158
3159static void incrRefCount(robj *o) {
3160 o->refcount++;
3161}
3162
3163static void decrRefCount(void *obj) {
3164 robj *o = obj;
94754ccc 3165
560db612 3166 /* Object is a swapped out value, or in the process of being loaded. */
996cb5f7 3167 if (server.vm_enabled &&
3168 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3169 {
560db612 3170 vmpointer *vp = obj;
3171 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3172 vmMarkPagesFree(vp->page,vp->usedpages);
7d98e08c 3173 server.vm_stats_swapped_objects--;
560db612 3174 zfree(vp);
a35ddf12 3175 return;
3176 }
560db612 3177
3178 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
e4ed181d 3179 /* Object is in memory, or in the process of being swapped out.
3180 *
3181 * If the object is being swapped out, abort the operation on
3182 * decrRefCount even if the refcount does not drop to 0: the object
3183 * is referenced at least two times, as value of the key AND as
3184 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3185 * done but the relevant key was removed in the meantime, the
3186 * complete jobs handler will not find the key about the job and the
3187 * assert will fail. */
3188 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3189 vmCancelThreadedIOJob(o);
ed9b544e 3190 if (--(o->refcount) == 0) {
3191 switch(o->type) {
3192 case REDIS_STRING: freeStringObject(o); break;
3193 case REDIS_LIST: freeListObject(o); break;
3194 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3195 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3196 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3197 default: redisPanic("Unknown object type"); break;
ed9b544e 3198 }
a5819310 3199 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3200 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3201 !listAddNodeHead(server.objfreelist,o))
3202 zfree(o);
a5819310 3203 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3204 }
3205}
3206
92b27fe9 3207static int checkType(redisClient *c, robj *o, int type) {
3208 if (o->type != type) {
3209 addReply(c,shared.wrongtypeerr);
3210 return 1;
3211 }
3212 return 0;
3213}
3214
724a51b1 3215/* Check if the nul-terminated string 's' can be represented by a long
3216 * (that is, is a number that fits into long without any other space or
3217 * character before or after the digits).
3218 *
3219 * If so, the function returns REDIS_OK and *longval is set to the value
3220 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3221static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3222 char buf[32], *endptr;
3223 long value;
3224 int slen;
e0a62c7f 3225
724a51b1 3226 value = strtol(s, &endptr, 10);
3227 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3228 slen = ll2string(buf,32,value);
724a51b1 3229
3230 /* If the number converted back into a string is not identical
3231 * then it's not possible to encode the string as integer */
f69f2cba 3232 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3233 if (longval) *longval = value;
3234 return REDIS_OK;
3235}
3236
942a3961 3237/* Try to encode a string object in order to save space */
05df7621 3238static robj *tryObjectEncoding(robj *o) {
942a3961 3239 long value;
942a3961 3240 sds s = o->ptr;
3305306f 3241
942a3961 3242 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3243 return o; /* Already encoded */
3305306f 3244
05df7621 3245 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3246 * everywhere in the "object space" of Redis. Encoded objects can only
3247 * appear as "values" (and not, for instance, as keys) */
05df7621 3248 if (o->refcount > 1) return o;
3305306f 3249
942a3961 3250 /* Currently we try to encode only strings */
dfc5e96c 3251 redisAssert(o->type == REDIS_STRING);
94754ccc 3252
724a51b1 3253 /* Check if we can represent this string as a long integer */
05df7621 3254 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3255
3256 /* Ok, this object can be encoded */
05df7621 3257 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3258 decrRefCount(o);
3259 incrRefCount(shared.integers[value]);
3260 return shared.integers[value];
3261 } else {
3262 o->encoding = REDIS_ENCODING_INT;
3263 sdsfree(o->ptr);
3264 o->ptr = (void*) value;
3265 return o;
3266 }
942a3961 3267}
3268
9d65a1bb 3269/* Get a decoded version of an encoded object (returned as a new object).
3270 * If the object is already raw-encoded just increment the ref count. */
3271static robj *getDecodedObject(robj *o) {
942a3961 3272 robj *dec;
e0a62c7f 3273
9d65a1bb 3274 if (o->encoding == REDIS_ENCODING_RAW) {
3275 incrRefCount(o);
3276 return o;
3277 }
942a3961 3278 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3279 char buf[32];
3280
ee14da56 3281 ll2string(buf,32,(long)o->ptr);
942a3961 3282 dec = createStringObject(buf,strlen(buf));
3283 return dec;
3284 } else {
08ee9b57 3285 redisPanic("Unknown encoding type");
942a3961 3286 }
3305306f 3287}
3288
d7f43c08 3289/* Compare two string objects via strcmp() or alike.
3290 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3291 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3292 * and compare the strings, it's much faster than calling getDecodedObject().
3293 *
3294 * Important note: if objects are not integer encoded, but binary-safe strings,
3295 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3296 * binary safe. */
724a51b1 3297static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3298 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3299 char bufa[128], bufb[128], *astr, *bstr;
3300 int bothsds = 1;
724a51b1 3301
e197b441 3302 if (a == b) return 0;
d7f43c08 3303 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3304 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3305 astr = bufa;
3306 bothsds = 0;
724a51b1 3307 } else {
d7f43c08 3308 astr = a->ptr;
724a51b1 3309 }
d7f43c08 3310 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3311 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3312 bstr = bufb;
3313 bothsds = 0;
3314 } else {
3315 bstr = b->ptr;
3316 }
3317 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3318}
3319
bf028098 3320/* Equal string objects return 1 if the two objects are the same from the
3321 * point of view of a string comparison, otherwise 0 is returned. Note that
3322 * this function is faster then checking for (compareStringObject(a,b) == 0)
3323 * because it can perform some more optimization. */
3324static int equalStringObjects(robj *a, robj *b) {
3325 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3326 return a->ptr == b->ptr;
3327 } else {
3328 return compareStringObjects(a,b) == 0;
3329 }
3330}
3331
0ea663ea 3332static size_t stringObjectLen(robj *o) {
dfc5e96c 3333 redisAssert(o->type == REDIS_STRING);
0ea663ea 3334 if (o->encoding == REDIS_ENCODING_RAW) {
3335 return sdslen(o->ptr);
3336 } else {
3337 char buf[32];
3338
ee14da56 3339 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3340 }
3341}
3342
bd79a6bd
PN
3343static int getDoubleFromObject(robj *o, double *target) {
3344 double value;
682c73e8 3345 char *eptr;
bbe025e0 3346
bd79a6bd
PN
3347 if (o == NULL) {
3348 value = 0;
3349 } else {
3350 redisAssert(o->type == REDIS_STRING);
3351 if (o->encoding == REDIS_ENCODING_RAW) {
3352 value = strtod(o->ptr, &eptr);
682c73e8 3353 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3354 } else if (o->encoding == REDIS_ENCODING_INT) {
3355 value = (long)o->ptr;
3356 } else {
946342c1 3357 redisPanic("Unknown string encoding");
bd79a6bd
PN
3358 }
3359 }
3360
bd79a6bd
PN
3361 *target = value;
3362 return REDIS_OK;
3363}
bbe025e0 3364
bd79a6bd
PN
3365static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3366 double value;
3367 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3368 if (msg != NULL) {
3369 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3370 } else {
3371 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3372 }
bbe025e0
AM
3373 return REDIS_ERR;
3374 }
3375
bd79a6bd 3376 *target = value;
bbe025e0
AM
3377 return REDIS_OK;
3378}
3379
bd79a6bd
PN
3380static int getLongLongFromObject(robj *o, long long *target) {
3381 long long value;
682c73e8 3382 char *eptr;
bbe025e0 3383
bd79a6bd
PN
3384 if (o == NULL) {
3385 value = 0;
3386 } else {
3387 redisAssert(o->type == REDIS_STRING);
3388 if (o->encoding == REDIS_ENCODING_RAW) {
3389 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3390 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3391 } else if (o->encoding == REDIS_ENCODING_INT) {
3392 value = (long)o->ptr;
3393 } else {
946342c1 3394 redisPanic("Unknown string encoding");
bd79a6bd
PN
3395 }
3396 }
3397
d0b58d53 3398 if (target) *target = value;
bd79a6bd
PN
3399 return REDIS_OK;
3400}
bbe025e0 3401
bd79a6bd
PN
3402static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3403 long long value;
3404 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3405 if (msg != NULL) {
3406 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3407 } else {
3408 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3409 }
bbe025e0
AM
3410 return REDIS_ERR;
3411 }
3412
bd79a6bd 3413 *target = value;
bbe025e0
AM
3414 return REDIS_OK;
3415}
3416
bd79a6bd
PN
3417static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3418 long long value;
bbe025e0 3419
bd79a6bd
PN
3420 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3421 if (value < LONG_MIN || value > LONG_MAX) {
3422 if (msg != NULL) {
3423 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3424 } else {
3425 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3426 }
bbe025e0
AM
3427 return REDIS_ERR;
3428 }
3429
bd79a6bd 3430 *target = value;
bbe025e0
AM
3431 return REDIS_OK;
3432}
3433
612e4de8 3434/* =========================== Keyspace access API ========================== */
3435
3436static robj *lookupKey(redisDb *db, robj *key) {
09241813 3437 dictEntry *de = dictFind(db->dict,key->ptr);
612e4de8 3438 if (de) {
612e4de8 3439 robj *val = dictGetEntryVal(de);
3440
3441 if (server.vm_enabled) {
3442 if (val->storage == REDIS_VM_MEMORY ||
3443 val->storage == REDIS_VM_SWAPPING)
3444 {
3445 /* If we were swapping the object out, cancel the operation */
3446 if (val->storage == REDIS_VM_SWAPPING)
3447 vmCancelThreadedIOJob(val);
09241813 3448 /* Update the access time for the aging algorithm. */
612e4de8 3449 val->lru = server.lruclock;
3450 } else {
3451 int notify = (val->storage == REDIS_VM_LOADING);
3452
3453 /* Our value was swapped on disk. Bring it at home. */
3454 redisAssert(val->type == REDIS_VMPOINTER);
3455 val = vmLoadObject(val);
3456 dictGetEntryVal(de) = val;
3457
3458 /* Clients blocked by the VM subsystem may be waiting for
3459 * this key... */
3460 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3461 }
3462 }
3463 return val;
3464 } else {
3465 return NULL;
3466 }
3467}
3468
3469static robj *lookupKeyRead(redisDb *db, robj *key) {
3470 expireIfNeeded(db,key);
3471 return lookupKey(db,key);
3472}
3473
3474static robj *lookupKeyWrite(redisDb *db, robj *key) {
3475 deleteIfVolatile(db,key);
3476 touchWatchedKey(db,key);
3477 return lookupKey(db,key);
3478}
3479
3480static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3481 robj *o = lookupKeyRead(c->db, key);
3482 if (!o) addReply(c,reply);
3483 return o;
3484}
3485
3486static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3487 robj *o = lookupKeyWrite(c->db, key);
3488 if (!o) addReply(c,reply);
3489 return o;
3490}
3491
09241813 3492/* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3493 * otherwise REDIS_OK is returned, and the caller should increment the
3494 * refcount of 'val'. */
3495static int dbAdd(redisDb *db, robj *key, robj *val) {
3496 /* Perform a lookup before adding the key, as we need to copy the
3497 * key value. */
3498 if (dictFind(db->dict, key->ptr) != NULL) {
3499 return REDIS_ERR;
3500 } else {
3501 sds copy = sdsdup(key->ptr);
3502 dictAdd(db->dict, copy, val);
3503 return REDIS_OK;
3504 }
3505}
3506
3507/* If the key does not exist, this is just like dbAdd(). Otherwise
3508 * the value associated to the key is replaced with the new one.
3509 *
3510 * On update (key already existed) 0 is returned. Otherwise 1. */
3511static int dbReplace(redisDb *db, robj *key, robj *val) {
3512 if (dictFind(db->dict,key->ptr) == NULL) {
3513 sds copy = sdsdup(key->ptr);
3514 dictAdd(db->dict, copy, val);
3515 return 1;
3516 } else {
3517 dictReplace(db->dict, key->ptr, val);
3518 return 0;
3519 }
3520}
3521
3522static int dbExists(redisDb *db, robj *key) {
3523 return dictFind(db->dict,key->ptr) != NULL;
3524}
3525
3526/* Return a random key, in form of a Redis object.
3527 * If there are no keys, NULL is returned.
3528 *
3529 * The function makes sure to return keys not already expired. */
3530static robj *dbRandomKey(redisDb *db) {
3531 struct dictEntry *de;
3532
3533 while(1) {
3534 sds key;
3535 robj *keyobj;
3536
3537 de = dictGetRandomKey(db->dict);
3538 if (de == NULL) return NULL;
3539
3540 key = dictGetEntryKey(de);
3541 keyobj = createStringObject(key,sdslen(key));
3542 if (dictFind(db->expires,key)) {
3543 if (expireIfNeeded(db,keyobj)) {
3544 decrRefCount(keyobj);
3545 continue; /* search for another key. This expired. */
3546 }
3547 }
3548 return keyobj;
3549 }
3550}
3551
3552/* Delete a key, value, and associated expiration entry if any, from the DB */
3553static int dbDelete(redisDb *db, robj *key) {
612e4de8 3554 int retval;
3555
09241813 3556 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3557 retval = dictDelete(db->dict,key->ptr);
612e4de8 3558
3559 return retval == DICT_OK;
3560}
3561
06233c45 3562/*============================ RDB saving/loading =========================== */
ed9b544e 3563
/* Write the single byte 'type' on 'fp'. Returns 0 on success, -1 if
 * nothing could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3568
/* Write the time 't' on 'fp' as a 4 byte integer, in the byte order of
 * the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3574
e3566d4b 3575/* check rdbLoadLen() comments for more info */
f78fd11b 3576static int rdbSaveLen(FILE *fp, uint32_t len) {
3577 unsigned char buf[2];
3578
3579 if (len < (1<<6)) {
3580 /* Save a 6 bit len */
10c43610 3581 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3582 if (fwrite(buf,1,1,fp) == 0) return -1;
3583 } else if (len < (1<<14)) {
3584 /* Save a 14 bit len */
10c43610 3585 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3586 buf[1] = len&0xFF;
17be1a4a 3587 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3588 } else {
3589 /* Save a 32 bit len */
10c43610 3590 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3591 if (fwrite(buf,1,1,fp) == 0) return -1;
3592 len = htonl(len);
3593 if (fwrite(&len,4,1,fp) == 0) return -1;
3594 }
3595 return 0;
3596}
3597
32a66513 3598/* Encode 'value' as an integer if possible (if integer will fit the
3599 * supported range). If the function sucessful encoded the integer
3600 * then the (up to 5 bytes) encoded representation is written in the
3601 * string pointed by 'enc' and the length is returned. Otherwise
3602 * 0 is returned. */
3603static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3604 /* Finally check if it fits in our ranges */
3605 if (value >= -(1<<7) && value <= (1<<7)-1) {
3606 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3607 enc[1] = value&0xFF;
3608 return 2;
3609 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3610 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3611 enc[1] = value&0xFF;
3612 enc[2] = (value>>8)&0xFF;
3613 return 3;
3614 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3615 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3616 enc[1] = value&0xFF;
3617 enc[2] = (value>>8)&0xFF;
3618 enc[3] = (value>>16)&0xFF;
3619 enc[4] = (value>>24)&0xFF;
3620 return 5;
3621 } else {
3622 return 0;
3623 }
3624}
3625
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated:
 * callers pass (pointer, length) pairs, so the bytes are copied into a
 * local terminated buffer before being parsed.
 *
 * Returns the length of the encoding written in 'enc', or 0 if the string
 * can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* An empty string is not a number, and a string that doesn't fit the
     * scratch buffer is longer than any long long printed in base 10, so
     * it could never survive the round trip check below. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3644
b1befe6a 3645static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3646 size_t comprlen, outlen;
774e3047 3647 unsigned char byte;
3648 void *out;
3649
3650 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3651 if (len <= 4) return 0;
3652 outlen = len-4;
3a2694c4 3653 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3654 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3655 if (comprlen == 0) {
88e85998 3656 zfree(out);
774e3047 3657 return 0;
3658 }
3659 /* Data compressed! Let's save it on disk */
3660 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3661 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3662 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3663 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3664 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3665 zfree(out);
774e3047 3666 return comprlen;
3667
3668writeerr:
88e85998 3669 zfree(out);
774e3047 3670 return -1;
3671}
3672
e3566d4b 3673/* Save a string objet as [len][data] on disk. If the object is a string
3674 * representation of an integer value we try to safe it in a special form */
b1befe6a 3675static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3676 int enclen;
10c43610 3677
774e3047 3678 /* Try integer encoding */
e3566d4b 3679 if (len <= 11) {
3680 unsigned char buf[5];
b1befe6a 3681 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3682 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3683 return 0;
3684 }
3685 }
774e3047 3686
3687 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3688 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3689 if (server.rdbcompression && len > 20) {
774e3047 3690 int retval;
3691
b1befe6a 3692 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3693 if (retval == -1) return -1;
3694 if (retval > 0) return 0;
3695 /* retval == 0 means data can't be compressed, save the old way */
3696 }
3697
3698 /* Store verbatim */
10c43610 3699 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3700 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3701 return 0;
3702}
3703
/* Save a long long value on 'fp', as an integer-encoded string when the
 * value fits the encodable ranges, or as a length-prefixed string holding
 * its decimal representation otherwise.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int n = rdbEncodeInteger(value,buf);

    if (n <= 0) {
        /* Not encodable: print the digits and write the length prefix.
         * The payload itself is written below, as for the encoded case. */
        n = ll2string((char*)buf,32,value);
        redisAssert(n < 32);
        if (rdbSaveLen(fp,n) == -1) return -1;
    }
    return (fwrite(buf,n,1,fp) == 0) ? -1 : 0;
}
3719
942a3961 3720/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3721static int rdbSaveStringObject(FILE *fp, robj *obj) {
32a66513 3722 /* Avoid to decode the object, then encode it again, if the
3723 * object is alrady integer encoded. */
3724 if (obj->encoding == REDIS_ENCODING_INT) {
2796f6da 3725 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
996cb5f7 3726 } else {
2796f6da
PN
3727 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3728 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3729 }
942a3961 3730}
3731
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so one byte less is available. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* The first byte holds the length of the text that follows. */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3774
06233c45 3775/* Save a Redis object. */
3776static int rdbSaveObject(FILE *fp, robj *o) {
3777 if (o->type == REDIS_STRING) {
3778 /* Save a string value */
3779 if (rdbSaveStringObject(fp,o) == -1) return -1;
3780 } else if (o->type == REDIS_LIST) {
3781 /* Save a list value */
23f96494
PN
3782 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3783 unsigned char *p;
3784 unsigned char *vstr;
3785 unsigned int vlen;
3786 long long vlong;
3787
3788 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3789 p = ziplistIndex(o->ptr,0);
3790 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3791 if (vstr) {
3792 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3793 return -1;
3794 } else {
3795 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3796 return -1;
3797 }
3798 p = ziplistNext(o->ptr,p);
3799 }
3800 } else if (o->encoding == REDIS_ENCODING_LIST) {
3801 list *list = o->ptr;
3802 listIter li;
3803 listNode *ln;
3804
3805 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3806 listRewind(list,&li);
3807 while((ln = listNext(&li))) {
3808 robj *eleobj = listNodeValue(ln);
3809 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3810 }
3811 } else {
3812 redisPanic("Unknown list encoding");
06233c45 3813 }
3814 } else if (o->type == REDIS_SET) {
3815 /* Save a set value */
d0b58d53
PN
3816 if (o->encoding == REDIS_ENCODING_HT) {
3817 dict *set = o->ptr;
3818 dictIterator *di = dictGetIterator(set);
3819 dictEntry *de;
06233c45 3820
d0b58d53
PN
3821 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3822 while((de = dictNext(di)) != NULL) {
3823 robj *eleobj = dictGetEntryKey(de);
3824 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3825 }
3826 dictReleaseIterator(di);
3827 } else if (o->encoding == REDIS_ENCODING_INTSET) {
3828 intset *is = o->ptr;
3829 long long llval;
3830 int i = 0;
3831
3832 if (rdbSaveLen(fp,intsetLen(is)) == -1) return -1;
3833 while(intsetGet(is,i++,&llval)) {
3834 if (rdbSaveLongLongAsStringObject(fp,llval) == -1) return -1;
3835 }
3836 } else {
3837 redisPanic("Unknown set encoding");
06233c45 3838 }
06233c45 3839 } else if (o->type == REDIS_ZSET) {
3840 /* Save a set value */
3841 zset *zs = o->ptr;
3842 dictIterator *di = dictGetIterator(zs->dict);
3843 dictEntry *de;
3844
3845 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3846 while((de = dictNext(di)) != NULL) {
3847 robj *eleobj = dictGetEntryKey(de);
3848 double *score = dictGetEntryVal(de);
3849
3850 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3851 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3852 }
3853 dictReleaseIterator(di);
b1befe6a 3854 } else if (o->type == REDIS_HASH) {
3855 /* Save a hash value */
3856 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3857 unsigned char *p = zipmapRewind(o->ptr);
3858 unsigned int count = zipmapLen(o->ptr);
3859 unsigned char *key, *val;
3860 unsigned int klen, vlen;
3861
3862 if (rdbSaveLen(fp,count) == -1) return -1;
3863 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3864 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3865 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3866 }
3867 } else {
3868 dictIterator *di = dictGetIterator(o->ptr);
3869 dictEntry *de;
3870
3871 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3872 while((de = dictNext(di)) != NULL) {
3873 robj *key = dictGetEntryKey(de);
3874 robj *val = dictGetEntryVal(de);
3875
3876 if (rdbSaveStringObject(fp,key) == -1) return -1;
3877 if (rdbSaveStringObject(fp,val) == -1) return -1;
3878 }
3879 dictReleaseIterator(di);
3880 }
06233c45 3881 } else {
f83c6cb5 3882 redisPanic("Unknown object type");
06233c45 3883 }
3884 return 0;
3885}
3886
3887/* Return the length the object will have on disk if saved with
3888 * the rdbSaveObject() function. Currently we use a trick to get
3889 * this length with very little changes to the code. In the future
3890 * we could switch to a faster solution. */
b9bc0eef 3891static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3892 if (fp == NULL) fp = server.devnull;
06233c45 3893 rewind(fp);
3894 assert(rdbSaveObject(fp,o) != 1);
3895 return ftello(fp);
3896}
3897
06224fec 3898/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3899static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3900 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3901
06224fec 3902 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3903}
3904
ed9b544e 3905/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3906static int rdbSave(char *filename) {
ed9b544e 3907 dictIterator *di = NULL;
3908 dictEntry *de;
ed9b544e 3909 FILE *fp;
3910 char tmpfile[256];
3911 int j;
bb32ede5 3912 time_t now = time(NULL);
ed9b544e 3913
2316bb3b 3914 /* Wait for I/O therads to terminate, just in case this is a
3915 * foreground-saving, to avoid seeking the swap file descriptor at the
3916 * same time. */
3917 if (server.vm_enabled)
3918 waitEmptyIOJobsQueue();
3919
a3b21203 3920 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3921 fp = fopen(tmpfile,"w");
3922 if (!fp) {
3923 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3924 return REDIS_ERR;
3925 }
f78fd11b 3926 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3927 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3928 redisDb *db = server.db+j;
3929 dict *d = db->dict;
3305306f 3930 if (dictSize(d) == 0) continue;
ed9b544e 3931 di = dictGetIterator(d);
3932 if (!di) {
3933 fclose(fp);
3934 return REDIS_ERR;
3935 }
3936
3937 /* Write the SELECT DB opcode */
f78fd11b 3938 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3939 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3940
3941 /* Iterate this DB writing every entry */
3942 while((de = dictNext(di)) != NULL) {
09241813 3943 sds keystr = dictGetEntryKey(de);
3944 robj key, *o = dictGetEntryVal(de);
3945 time_t expiretime;
3946
3947 initStaticStringObject(key,keystr);
3948 expiretime = getExpire(db,&key);
bb32ede5 3949
3950 /* Save the expire time */
3951 if (expiretime != -1) {
3952 /* If this key is already expired skip it */
3953 if (expiretime < now) continue;
3954 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3955 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3956 }
7e69548d 3957 /* Save the key and associated value. This requires special
3958 * handling if the value is swapped out. */
560db612 3959 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3960 o->storage == REDIS_VM_SWAPPING) {
7e69548d 3961 /* Save type, key, value */
3962 if (rdbSaveType(fp,o->type) == -1) goto werr;
09241813 3963 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3964 if (rdbSaveObject(fp,o) == -1) goto werr;
3965 } else {
996cb5f7 3966 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3967 robj *po;
7e69548d 3968 /* Get a preview of the object in memory */
560db612 3969 po = vmPreviewObject(o);
7e69548d 3970 /* Save type, key, value */
560db612 3971 if (rdbSaveType(fp,po->type) == -1) goto werr;
09241813 3972 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3973 if (rdbSaveObject(fp,po) == -1) goto werr;
3974 /* Remove the loaded object from memory */
3975 decrRefCount(po);
7e69548d 3976 }
ed9b544e 3977 }
3978 dictReleaseIterator(di);
3979 }
3980 /* EOF opcode */
f78fd11b 3981 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3982
3983 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3984 fflush(fp);
3985 fsync(fileno(fp));
3986 fclose(fp);
e0a62c7f 3987
ed9b544e 3988 /* Use RENAME to make sure the DB file is changed atomically only
3989 * if the generate DB file is ok. */
3990 if (rename(tmpfile,filename) == -1) {
325d1eb4 3991 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3992 unlink(tmpfile);
3993 return REDIS_ERR;
3994 }
3995 redisLog(REDIS_NOTICE,"DB saved on disk");
3996 server.dirty = 0;
3997 server.lastsave = time(NULL);
3998 return REDIS_OK;
3999
4000werr:
4001 fclose(fp);
4002 unlink(tmpfile);
4003 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
4004 if (di) dictReleaseIterator(di);
4005 return REDIS_ERR;
4006}
4007
f78fd11b 4008static int rdbSaveBackground(char *filename) {
ed9b544e 4009 pid_t childpid;
4010
9d65a1bb 4011 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 4012 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 4013 if ((childpid = fork()) == 0) {
4014 /* Child */
054e426d 4015 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 4016 close(server.fd);
f78fd11b 4017 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 4018 _exit(0);
ed9b544e 4019 } else {
478c2c6f 4020 _exit(1);
ed9b544e 4021 }
4022 } else {
4023 /* Parent */
5a7c647e 4024 if (childpid == -1) {
4025 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
4026 strerror(errno));
4027 return REDIS_ERR;
4028 }
ed9b544e 4029 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 4030 server.bgsavechildpid = childpid;
884d4b39 4031 updateDictResizePolicy();
ed9b544e 4032 return REDIS_OK;
4033 }
4034 return REDIS_OK; /* unreached */
4035}
4036
/* Remove the temporary dump file named after the process id 'childpid'
 * (the same name rdbSave() builds from its own pid). The result of the
 * removal is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4043
/* Read a single byte from 'fp' and return it as a non negative int, or
 * -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
4049
/* Read from 'fp' a time saved as a 4 byte integer in the byte order of
 * the host. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
4055
e3566d4b 4056/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4057 * of this file for a description of how this are stored on disk.
4058 *
4059 * isencoded is set to 1 if the readed length is not actually a length but
4060 * an "encoding type", check the above comments for more info */
c78a8ccc 4061static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 4062 unsigned char buf[2];
4063 uint32_t len;
c78a8ccc 4064 int type;
f78fd11b 4065
e3566d4b 4066 if (isencoded) *isencoded = 0;
c78a8ccc 4067 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4068 type = (buf[0]&0xC0)>>6;
4069 if (type == REDIS_RDB_6BITLEN) {
4070 /* Read a 6 bit len */
4071 return buf[0]&0x3F;
4072 } else if (type == REDIS_RDB_ENCVAL) {
4073 /* Read a 6 bit len encoding type */
4074 if (isencoded) *isencoded = 1;
4075 return buf[0]&0x3F;
4076 } else if (type == REDIS_RDB_14BITLEN) {
4077 /* Read a 14 bit len */
4078 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4079 return ((buf[0]&0x3F)<<8)|buf[1];
4080 } else {
4081 /* Read a 32 bit len */
f78fd11b 4082 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4083 return ntohl(len);
f78fd11b 4084 }
f78fd11b 4085}
4086
ad30aa60 4087/* Load an integer-encoded object from file 'fp', with the specified
4088 * encoding type 'enctype'. If encode is true the function may return
4089 * an integer-encoded object as reply, otherwise the returned object
4090 * will always be encoded as a raw string. */
4091static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 4092 unsigned char enc[4];
4093 long long val;
4094
4095 if (enctype == REDIS_RDB_ENC_INT8) {
4096 if (fread(enc,1,1,fp) == 0) return NULL;
4097 val = (signed char)enc[0];
4098 } else if (enctype == REDIS_RDB_ENC_INT16) {
4099 uint16_t v;
4100 if (fread(enc,2,1,fp) == 0) return NULL;
4101 v = enc[0]|(enc[1]<<8);
4102 val = (int16_t)v;
4103 } else if (enctype == REDIS_RDB_ENC_INT32) {
4104 uint32_t v;
4105 if (fread(enc,4,1,fp) == 0) return NULL;
4106 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4107 val = (int32_t)v;
4108 } else {
4109 val = 0; /* anti-warning */
f83c6cb5 4110 redisPanic("Unknown RDB integer encoding type");
e3566d4b 4111 }
ad30aa60 4112 if (encode)
4113 return createStringObjectFromLongLong(val);
4114 else
4115 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 4116}
4117
c78a8ccc 4118static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 4119 unsigned int len, clen;
4120 unsigned char *c = NULL;
4121 sds val = NULL;
4122
c78a8ccc 4123 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4124 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 4125 if ((c = zmalloc(clen)) == NULL) goto err;
4126 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4127 if (fread(c,clen,1,fp) == 0) goto err;
4128 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 4129 zfree(c);
88e85998 4130 return createObject(REDIS_STRING,val);
4131err:
4132 zfree(c);
4133 sdsfree(val);
4134 return NULL;
4135}
4136
ad30aa60 4137static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 4138 int isencoded;
4139 uint32_t len;
f78fd11b 4140 sds val;
4141
c78a8ccc 4142 len = rdbLoadLen(fp,&isencoded);
e3566d4b 4143 if (isencoded) {
4144 switch(len) {
4145 case REDIS_RDB_ENC_INT8:
4146 case REDIS_RDB_ENC_INT16:
4147 case REDIS_RDB_ENC_INT32:
ad30aa60 4148 return rdbLoadIntegerObject(fp,len,encode);
88e85998 4149 case REDIS_RDB_ENC_LZF:
bdcb92f2 4150 return rdbLoadLzfStringObject(fp);
e3566d4b 4151 default:
f83c6cb5 4152 redisPanic("Unknown RDB encoding type");
e3566d4b 4153 }
4154 }
4155
f78fd11b 4156 if (len == REDIS_RDB_LENERR) return NULL;
4157 val = sdsnewlen(NULL,len);
4158 if (len && fread(val,len,1,fp) == 0) {
4159 sdsfree(val);
4160 return NULL;
4161 }
bdcb92f2 4162 return createObject(REDIS_STRING,val);
f78fd11b 4163}
4164
ad30aa60 4165static robj *rdbLoadStringObject(FILE *fp) {
4166 return rdbGenericLoadStringObject(fp,0);
4167}
4168
4169static robj *rdbLoadEncodedStringObject(FILE *fp) {
4170 return rdbGenericLoadStringObject(fp,1);
4171}
4172
a7866db6 4173/* For information about double serialization check rdbSaveDoubleValue() */
4174static int rdbLoadDoubleValue(FILE *fp, double *val) {
4175 char buf[128];
4176 unsigned char len;
4177
4178 if (fread(&len,1,1,fp) == 0) return -1;
4179 switch(len) {
4180 case 255: *val = R_NegInf; return 0;
4181 case 254: *val = R_PosInf; return 0;
4182 case 253: *val = R_Nan; return 0;
4183 default:
4184 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4185 buf[len] = '\0';
a7866db6 4186 sscanf(buf, "%lg", val);
4187 return 0;
4188 }
4189}
4190
c78a8ccc 4191/* Load a Redis object of the specified type from the specified file.
4192 * On success a newly allocated object is returned, otherwise NULL. */
4193static robj *rdbLoadObject(int type, FILE *fp) {
23f96494
PN
4194 robj *o, *ele, *dec;
4195 size_t len;
400aea2b 4196 unsigned int i;
c78a8ccc 4197
bcd11906 4198 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4199 if (type == REDIS_STRING) {
4200 /* Read string value */
ad30aa60 4201 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4202 o = tryObjectEncoding(o);
23f96494
PN
4203 } else if (type == REDIS_LIST) {
4204 /* Read list value */
4205 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4206
d0686e07
PN
4207 /* Use a real list when there are too many entries */
4208 if (len > server.list_max_ziplist_entries) {
4209 o = createListObject();
4210 } else {
4211 o = createZiplistObject();
4212 }
c78a8ccc 4213
23f96494
PN
4214 /* Load every single element of the list */
4215 while(len--) {
4216 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4217
d0686e07
PN
4218 /* If we are using a ziplist and the value is too big, convert
4219 * the object to a real list. */
4220 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4221 ele->encoding == REDIS_ENCODING_RAW &&
4222 sdslen(ele->ptr) > server.list_max_ziplist_value)
003f0840 4223 listTypeConvert(o,REDIS_ENCODING_LIST);
d0686e07 4224
23f96494
PN
4225 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4226 dec = getDecodedObject(ele);
4227 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4228 decrRefCount(dec);
4229 decrRefCount(ele);
4230 } else {
4231 ele = tryObjectEncoding(ele);
4232 listAddNodeTail(o->ptr,ele);
23f96494
PN
4233 }
4234 }
4235 } else if (type == REDIS_SET) {
4236 /* Read list/set value */
4237 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
273f6169
PN
4238
4239 /* Use a regular set when there are too many entries. */
4240 if (len > server.set_max_intset_entries) {
4241 o = createSetObject();
4242 /* It's faster to expand the dict to the right size asap in order
4243 * to avoid rehashing */
4244 if (len > DICT_HT_INITIAL_SIZE)
4245 dictExpand(o->ptr,len);
4246 } else {
4247 o = createIntsetObject();
4248 }
4249
c78a8ccc 4250 /* Load every single element of the list/set */
400aea2b 4251 for (i = 0; i < len; i++) {
273f6169 4252 long long llval;
ad30aa60 4253 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4254 ele = tryObjectEncoding(ele);
273f6169
PN
4255
4256 if (o->encoding == REDIS_ENCODING_INTSET) {
4257 /* Fetch integer value from element */
4258 if (getLongLongFromObject(ele,&llval) == REDIS_OK) {
4259 o->ptr = intsetAdd(o->ptr,llval,NULL);
4260 } else {
4261 setTypeConvert(o,REDIS_ENCODING_HT);
400aea2b 4262 dictExpand(o->ptr,len);
273f6169
PN
4263 }
4264 }
4265
4266 /* This will also be called when the set was just converted
4267 * to regular hashtable encoded set */
4268 if (o->encoding == REDIS_ENCODING_HT) {
4269 dictAdd((dict*)o->ptr,ele,NULL);
4270 }
c78a8ccc 4271 }
4272 } else if (type == REDIS_ZSET) {
4273 /* Read list/set value */
ada386b2 4274 size_t zsetlen;
c78a8ccc 4275 zset *zs;
4276
4277 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4278 o = createZsetObject();
4279 zs = o->ptr;
4280 /* Load every single element of the list/set */
4281 while(zsetlen--) {
4282 robj *ele;
4283 double *score = zmalloc(sizeof(double));
4284
ad30aa60 4285 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4286 ele = tryObjectEncoding(ele);
c78a8ccc 4287 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4288 dictAdd(zs->dict,ele,score);
4289 zslInsert(zs->zsl,*score,ele);
4290 incrRefCount(ele); /* added to skiplist */
4291 }
ada386b2 4292 } else if (type == REDIS_HASH) {
4293 size_t hashlen;
4294
4295 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4296 o = createHashObject();
4297 /* Too many entries? Use an hash table. */
4298 if (hashlen > server.hash_max_zipmap_entries)
4299 convertToRealHash(o);
4300 /* Load every key/value, then set it into the zipmap or hash
4301 * table, as needed. */
4302 while(hashlen--) {
4303 robj *key, *val;
4304
b785b2bf 4305 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4306 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
ada386b2 4307 /* If we are using a zipmap and there are too big values
4308 * the object is converted to real hash table encoding. */
4309 if (o->encoding != REDIS_ENCODING_HT &&
4310 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4311 sdslen(val->ptr) > server.hash_max_zipmap_value))
4312 {
4313 convertToRealHash(o);
4314 }
4315
4316 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4317 unsigned char *zm = o->ptr;
4318
4319 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4320 val->ptr,sdslen(val->ptr),NULL);
4321 o->ptr = zm;
4322 decrRefCount(key);
4323 decrRefCount(val);
4324 } else {
05df7621 4325 key = tryObjectEncoding(key);
4326 val = tryObjectEncoding(val);
ada386b2 4327 dictAdd((dict*)o->ptr,key,val);
ada386b2 4328 }
4329 }
c78a8ccc 4330 } else {
f83c6cb5 4331 redisPanic("Unknown object type");
c78a8ccc 4332 }
4333 return o;
4334}
4335
f78fd11b 4336static int rdbLoad(char *filename) {
ed9b544e 4337 FILE *fp;
f78fd11b 4338 uint32_t dbid;
bb32ede5 4339 int type, retval, rdbver;
585af7e2 4340 int swap_all_values = 0;
bb32ede5 4341 redisDb *db = server.db+0;
f78fd11b 4342 char buf[1024];
242a64f3 4343 time_t expiretime, now = time(NULL);
bb32ede5 4344
ed9b544e 4345 fp = fopen(filename,"r");
4346 if (!fp) return REDIS_ERR;
4347 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4348 buf[9] = '\0';
4349 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4350 fclose(fp);
4351 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4352 return REDIS_ERR;
4353 }
f78fd11b 4354 rdbver = atoi(buf+5);
c78a8ccc 4355 if (rdbver != 1) {
f78fd11b 4356 fclose(fp);
4357 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4358 return REDIS_ERR;
4359 }
ed9b544e 4360 while(1) {
585af7e2 4361 robj *key, *val;
7e02fe32 4362 int force_swapout;
ed9b544e 4363
585af7e2 4364 expiretime = -1;
ed9b544e 4365 /* Read type. */
f78fd11b 4366 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4367 if (type == REDIS_EXPIRETIME) {
4368 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4369 /* We read the time so we need to read the object type again */
4370 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4371 }
ed9b544e 4372 if (type == REDIS_EOF) break;
4373 /* Handle SELECT DB opcode as a special case */
4374 if (type == REDIS_SELECTDB) {
c78a8ccc 4375 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4376 goto eoferr;
ed9b544e 4377 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4378 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4379 exit(1);
4380 }
bb32ede5 4381 db = server.db+dbid;
ed9b544e 4382 continue;
4383 }
4384 /* Read key */
585af7e2 4385 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4386 /* Read value */
585af7e2 4387 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4388 /* Check if the key already expired */
4389 if (expiretime != -1 && expiretime < now) {
4390 decrRefCount(key);
4391 decrRefCount(val);
4392 continue;
4393 }
ed9b544e 4394 /* Add the new object in the hash table */
09241813 4395 retval = dbAdd(db,key,val);
4396 if (retval == REDIS_ERR) {
585af7e2 4397 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4398 exit(1);
4399 }
bb32ede5 4400 /* Set the expire time if needed */
89e689c5 4401 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4402
b492cf00 4403 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4404
4405 /* If we detecter we are hopeless about fitting something in memory
4406 * we just swap every new key on disk. Directly...
4407 * Note that's important to check for this condition before resorting
4408 * to random sampling, otherwise we may try to swap already
4409 * swapped keys. */
585af7e2 4410 if (swap_all_values) {
09241813 4411 dictEntry *de = dictFind(db->dict,key->ptr);
242a64f3 4412
4413 /* de may be NULL since the key already expired */
4414 if (de) {
560db612 4415 vmpointer *vp;
585af7e2 4416 val = dictGetEntryVal(de);
242a64f3 4417
560db612 4418 if (val->refcount == 1 &&
4419 (vp = vmSwapObjectBlocking(val)) != NULL)
4420 dictGetEntryVal(de) = vp;
242a64f3 4421 }
09241813 4422 decrRefCount(key);
242a64f3 4423 continue;
4424 }
09241813 4425 decrRefCount(key);
242a64f3 4426
a89b7013 4427 /* Flush data on disk once 32 MB of additional RAM are used... */
7e02fe32 4428 force_swapout = 0;
4429 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4430 force_swapout = 1;
242a64f3 4431
4432 /* If we have still some hope of having some value fitting memory
4433 * then we try random sampling. */
7e02fe32 4434 if (!swap_all_values && server.vm_enabled && force_swapout) {
b492cf00 4435 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4436 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4437 }
242a64f3 4438 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4439 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4440 }
ed9b544e 4441 }
4442 fclose(fp);
4443 return REDIS_OK;
4444
4445eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4446 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4447 exit(1);
4448 return REDIS_ERR; /* Just to avoid warning */
4449}
4450
b58ba105 4451/*================================== Shutdown =============================== */
fab43727 4452static int prepareForShutdown() {
b58ba105
AM
4453 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4454 /* Kill the saving child if there is a background saving in progress.
4455 We want to avoid race conditions, for instance our saving child may
4456 overwrite the synchronous saving did by SHUTDOWN. */
4457 if (server.bgsavechildpid != -1) {
4458 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4459 kill(server.bgsavechildpid,SIGKILL);
4460 rdbRemoveTempFile(server.bgsavechildpid);
4461 }
4462 if (server.appendonly) {
4463 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4464 aof_fsync(server.appendfd);
b58ba105 4465 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4466 } else {
4467 /* Snapshotting. Perform a SYNC SAVE and exit */
4468 if (rdbSave(server.dbfilename) == REDIS_OK) {
4469 if (server.daemonize)
4470 unlink(server.pidfile);
4471 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4472 } else {
4473 /* Ooops.. error saving! The best we can do is to continue
4474 * operating. Note that if there was a background saving process,
4475 * in the next cron() Redis will be notified that the background
4476 * saving aborted, handling special stuff like slaves pending for
4477 * synchronization... */
4478 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4479 return REDIS_ERR;
b58ba105
AM
4480 }
4481 }
8513a757 4482 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4483 return REDIS_OK;
b58ba105
AM
4484}
4485
ed9b544e 4486/*================================== Commands =============================== */
4487
abcb223e 4488static void authCommand(redisClient *c) {
2e77c2ee 4489 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4490 c->authenticated = 1;
4491 addReply(c,shared.ok);
4492 } else {
4493 c->authenticated = 0;
fa4c0aba 4494 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4495 }
4496}
4497
ed9b544e 4498static void pingCommand(redisClient *c) {
4499 addReply(c,shared.pong);
4500}
4501
4502static void echoCommand(redisClient *c) {
dd88747b 4503 addReplyBulk(c,c->argv[1]);
ed9b544e 4504}
4505
4506/*=================================== Strings =============================== */
4507
526d00a5 4508static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4509 int retval;
10ce1276 4510 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4511
526d00a5 4512 if (expire) {
4513 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4514 return;
4515 if (seconds <= 0) {
4516 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4517 return;
4518 }
4519 }
4520
37ab76c9 4521 touchWatchedKey(c->db,key);
526d00a5 4522 if (nx) deleteIfVolatile(c->db,key);
09241813 4523 retval = dbAdd(c->db,key,val);
4524 if (retval == REDIS_ERR) {
ed9b544e 4525 if (!nx) {
09241813 4526 dbReplace(c->db,key,val);
526d00a5 4527 incrRefCount(val);
ed9b544e 4528 } else {
c937aa89 4529 addReply(c,shared.czero);
ed9b544e 4530 return;
4531 }
4532 } else {
526d00a5 4533 incrRefCount(val);
ed9b544e 4534 }
4535 server.dirty++;
526d00a5 4536 removeExpire(c->db,key);
4537 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4538 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4539}
4540
4541static void setCommand(redisClient *c) {
526d00a5 4542 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4543}
4544
4545static void setnxCommand(redisClient *c) {
526d00a5 4546 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4547}
4548
4549static void setexCommand(redisClient *c) {
4550 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4551}
4552
322fc7d8 4553static int getGenericCommand(redisClient *c) {
dd88747b 4554 robj *o;
e0a62c7f 4555
dd88747b 4556 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4557 return REDIS_OK;
dd88747b 4558
4559 if (o->type != REDIS_STRING) {
4560 addReply(c,shared.wrongtypeerr);
4561 return REDIS_ERR;
ed9b544e 4562 } else {
dd88747b 4563 addReplyBulk(c,o);
4564 return REDIS_OK;
ed9b544e 4565 }
4566}
4567
322fc7d8 4568static void getCommand(redisClient *c) {
4569 getGenericCommand(c);
4570}
4571
f6b141c5 4572static void getsetCommand(redisClient *c) {
322fc7d8 4573 if (getGenericCommand(c) == REDIS_ERR) return;
09241813 4574 dbReplace(c->db,c->argv[1],c->argv[2]);
a431eb74 4575 incrRefCount(c->argv[2]);
4576 server.dirty++;
4577 removeExpire(c->db,c->argv[1]);
4578}
4579
70003d28 4580static void mgetCommand(redisClient *c) {
70003d28 4581 int j;
e0a62c7f 4582
c937aa89 4583 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4584 for (j = 1; j < c->argc; j++) {
3305306f 4585 robj *o = lookupKeyRead(c->db,c->argv[j]);
4586 if (o == NULL) {
c937aa89 4587 addReply(c,shared.nullbulk);
70003d28 4588 } else {
70003d28 4589 if (o->type != REDIS_STRING) {
c937aa89 4590 addReply(c,shared.nullbulk);
70003d28 4591 } else {
dd88747b 4592 addReplyBulk(c,o);
70003d28 4593 }
4594 }
4595 }
4596}
4597
6c446631 4598static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4599 int j, busykeys = 0;
6c446631 4600
4601 if ((c->argc % 2) == 0) {
454d4e43 4602 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4603 return;
4604 }
4605 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4606 * set nothing at all if at least one already key exists. */
4607 if (nx) {
4608 for (j = 1; j < c->argc; j += 2) {
906573e7 4609 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4610 busykeys++;
6c446631 4611 }
4612 }
4613 }
906573e7 4614 if (busykeys) {
4615 addReply(c, shared.czero);
4616 return;
4617 }
6c446631 4618
4619 for (j = 1; j < c->argc; j += 2) {
05df7621 4620 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
09241813 4621 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4622 incrRefCount(c->argv[j+1]);
6c446631 4623 removeExpire(c->db,c->argv[j]);
4624 }
4625 server.dirty += (c->argc-1)/2;
4626 addReply(c, nx ? shared.cone : shared.ok);
4627}
4628
4629static void msetCommand(redisClient *c) {
4630 msetGenericCommand(c,0);
4631}
4632
4633static void msetnxCommand(redisClient *c) {
4634 msetGenericCommand(c,1);
4635}
4636
d68ed120 4637static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4638 long long value;
ed9b544e 4639 robj *o;
e0a62c7f 4640
3305306f 4641 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4642 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4643 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4644
4645 value += incr;
d6f4c262 4646 o = createStringObjectFromLongLong(value);
09241813 4647 dbReplace(c->db,c->argv[1],o);
ed9b544e 4648 server.dirty++;
c937aa89 4649 addReply(c,shared.colon);
ed9b544e 4650 addReply(c,o);
4651 addReply(c,shared.crlf);
4652}
4653
4654static void incrCommand(redisClient *c) {
a4d1ba9a 4655 incrDecrCommand(c,1);
ed9b544e 4656}
4657
4658static void decrCommand(redisClient *c) {
a4d1ba9a 4659 incrDecrCommand(c,-1);
ed9b544e 4660}
4661
4662static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4663 long long incr;
4664
bd79a6bd 4665 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4666 incrDecrCommand(c,incr);
ed9b544e 4667}
4668
4669static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4670 long long incr;
4671
bd79a6bd 4672 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4673 incrDecrCommand(c,-incr);
ed9b544e 4674}
4675
4b00bebd 4676static void appendCommand(redisClient *c) {
4677 int retval;
4678 size_t totlen;
4679 robj *o;
4680
4681 o = lookupKeyWrite(c->db,c->argv[1]);
4682 if (o == NULL) {
4683 /* Create the key */
09241813 4684 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4b00bebd 4685 incrRefCount(c->argv[2]);
4686 totlen = stringObjectLen(c->argv[2]);
4687 } else {
4b00bebd 4688 if (o->type != REDIS_STRING) {
4689 addReply(c,shared.wrongtypeerr);
4690 return;
4691 }
4692 /* If the object is specially encoded or shared we have to make
4693 * a copy */
4694 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4695 robj *decoded = getDecodedObject(o);
4696
4697 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4698 decrRefCount(decoded);
09241813 4699 dbReplace(c->db,c->argv[1],o);
4b00bebd 4700 }
4701 /* APPEND! */
4702 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4703 o->ptr = sdscatlen(o->ptr,
4704 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4705 } else {
4706 o->ptr = sdscatprintf(o->ptr, "%ld",
4707 (unsigned long) c->argv[2]->ptr);
4708 }
4709 totlen = sdslen(o->ptr);
4710 }
4711 server.dirty++;
4712 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4713}
4714
39191553 4715static void substrCommand(redisClient *c) {
4716 robj *o;
4717 long start = atoi(c->argv[2]->ptr);
4718 long end = atoi(c->argv[3]->ptr);
dd88747b 4719 size_t rangelen, strlen;
4720 sds range;
39191553 4721
dd88747b 4722 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4723 checkType(c,o,REDIS_STRING)) return;
39191553 4724
dd88747b 4725 o = getDecodedObject(o);
4726 strlen = sdslen(o->ptr);
8fe7fad7 4727
dd88747b 4728 /* convert negative indexes */
4729 if (start < 0) start = strlen+start;
4730 if (end < 0) end = strlen+end;
4731 if (start < 0) start = 0;
4732 if (end < 0) end = 0;
39191553 4733
dd88747b 4734 /* indexes sanity checks */
4735 if (start > end || (size_t)start >= strlen) {
4736 /* Out of range start or start > end result in null reply */
4737 addReply(c,shared.nullbulk);
4738 decrRefCount(o);
4739 return;
39191553 4740 }
dd88747b 4741 if ((size_t)end >= strlen) end = strlen-1;
4742 rangelen = (end-start)+1;
4743
4744 /* Return the result */
4745 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4746 range = sdsnewlen((char*)o->ptr+start,rangelen);
4747 addReplySds(c,range);
4748 addReply(c,shared.crlf);
4749 decrRefCount(o);
39191553 4750}
4751
ed9b544e 4752/* ========================= Type agnostic commands ========================= */
4753
4754static void delCommand(redisClient *c) {
5109cdff 4755 int deleted = 0, j;
4756
4757 for (j = 1; j < c->argc; j++) {
09241813 4758 if (dbDelete(c->db,c->argv[j])) {
37ab76c9 4759 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4760 server.dirty++;
4761 deleted++;
4762 }
4763 }
482b672d 4764 addReplyLongLong(c,deleted);
ed9b544e 4765}
4766
4767static void existsCommand(redisClient *c) {
f4f06efc 4768 expireIfNeeded(c->db,c->argv[1]);
09241813 4769 if (dbExists(c->db,c->argv[1])) {
f4f06efc
PN
4770 addReply(c, shared.cone);
4771 } else {
4772 addReply(c, shared.czero);
4773 }
ed9b544e 4774}
4775
4776static void selectCommand(redisClient *c) {
4777 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4778
ed9b544e 4779 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4780 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4781 } else {
4782 addReply(c,shared.ok);
4783 }
4784}
4785
4786static void randomkeyCommand(redisClient *c) {
dc4be23e 4787 robj *key;
e0a62c7f 4788
09241813 4789 if ((key = dbRandomKey(c->db)) == NULL) {
dc4be23e 4790 addReply(c,shared.nullbulk);
4791 return;
4792 }
4793
09241813 4794 addReplyBulk(c,key);
4795 decrRefCount(key);
ed9b544e 4796}
4797
4798static void keysCommand(redisClient *c) {
4799 dictIterator *di;
4800 dictEntry *de;
4801 sds pattern = c->argv[1]->ptr;
4802 int plen = sdslen(pattern);
a3f9eec2 4803 unsigned long numkeys = 0;
ed9b544e 4804 robj *lenobj = createObject(REDIS_STRING,NULL);
4805
3305306f 4806 di = dictGetIterator(c->db->dict);
ed9b544e 4807 addReply(c,lenobj);
4808 decrRefCount(lenobj);
4809 while((de = dictNext(di)) != NULL) {
09241813 4810 sds key = dictGetEntryKey(de);
4811 robj *keyobj;
3305306f 4812
ed9b544e 4813 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4814 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
09241813 4815 keyobj = createStringObject(key,sdslen(key));
3305306f 4816 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4817 addReplyBulk(c,keyobj);
3305306f 4818 numkeys++;
3305306f 4819 }
09241813 4820 decrRefCount(keyobj);
ed9b544e 4821 }
4822 }
4823 dictReleaseIterator(di);
a3f9eec2 4824 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4825}
4826
4827static void dbsizeCommand(redisClient *c) {
4828 addReplySds(c,
3305306f 4829 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4830}
4831
4832static void lastsaveCommand(redisClient *c) {
4833 addReplySds(c,
c937aa89 4834 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4835}
4836
4837static void typeCommand(redisClient *c) {
3305306f 4838 robj *o;
ed9b544e 4839 char *type;
3305306f 4840
4841 o = lookupKeyRead(c->db,c->argv[1]);
4842 if (o == NULL) {
c937aa89 4843 type = "+none";
ed9b544e 4844 } else {
ed9b544e 4845 switch(o->type) {
c937aa89 4846 case REDIS_STRING: type = "+string"; break;
4847 case REDIS_LIST: type = "+list"; break;
4848 case REDIS_SET: type = "+set"; break;
412a8bce 4849 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4850 case REDIS_HASH: type = "+hash"; break;
4851 default: type = "+unknown"; break;
ed9b544e 4852 }
4853 }
4854 addReplySds(c,sdsnew(type));
4855 addReply(c,shared.crlf);
4856}
4857
4858static void saveCommand(redisClient *c) {
9d65a1bb 4859 if (server.bgsavechildpid != -1) {
05557f6d 4860 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4861 return;
4862 }
f78fd11b 4863 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4864 addReply(c,shared.ok);
4865 } else {
4866 addReply(c,shared.err);
4867 }
4868}
4869
4870static void bgsaveCommand(redisClient *c) {
9d65a1bb 4871 if (server.bgsavechildpid != -1) {
ed9b544e 4872 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4873 return;
4874 }
f78fd11b 4875 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4876 char *status = "+Background saving started\r\n";
4877 addReplySds(c,sdsnew(status));
ed9b544e 4878 } else {
4879 addReply(c,shared.err);
4880 }
4881}
4882
4883static void shutdownCommand(redisClient *c) {
fab43727 4884 if (prepareForShutdown() == REDIS_OK)
4885 exit(0);
4886 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4887}
4888
4889static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4890 robj *o;
4891
4892 /* To use the same key as src and dst is probably an error */
4893 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4894 addReply(c,shared.sameobjecterr);
ed9b544e 4895 return;
4896 }
4897
dd88747b 4898 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4899 return;
dd88747b 4900
ed9b544e 4901 incrRefCount(o);
3305306f 4902 deleteIfVolatile(c->db,c->argv[2]);
09241813 4903 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
ed9b544e 4904 if (nx) {
4905 decrRefCount(o);
c937aa89 4906 addReply(c,shared.czero);
ed9b544e 4907 return;
4908 }
09241813 4909 dbReplace(c->db,c->argv[2],o);
ed9b544e 4910 }
09241813 4911 dbDelete(c->db,c->argv[1]);
b167f877 4912 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4913 server.dirty++;
c937aa89 4914 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4915}
4916
4917static void renameCommand(redisClient *c) {
4918 renameGenericCommand(c,0);
4919}
4920
4921static void renamenxCommand(redisClient *c) {
4922 renameGenericCommand(c,1);
4923}
4924
4925static void moveCommand(redisClient *c) {
3305306f 4926 robj *o;
4927 redisDb *src, *dst;
ed9b544e 4928 int srcid;
4929
4930 /* Obtain source and target DB pointers */
3305306f 4931 src = c->db;
4932 srcid = c->db->id;
ed9b544e 4933 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4934 addReply(c,shared.outofrangeerr);
ed9b544e 4935 return;
4936 }
3305306f 4937 dst = c->db;
4938 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4939
4940 /* If the user is moving using as target the same
4941 * DB as the source DB it is probably an error. */
4942 if (src == dst) {
c937aa89 4943 addReply(c,shared.sameobjecterr);
ed9b544e 4944 return;
4945 }
4946
4947 /* Check if the element exists and get a reference */
3305306f 4948 o = lookupKeyWrite(c->db,c->argv[1]);
4949 if (!o) {
c937aa89 4950 addReply(c,shared.czero);
ed9b544e 4951 return;
4952 }
4953
4954 /* Try to add the element to the target DB */
3305306f 4955 deleteIfVolatile(dst,c->argv[1]);
09241813 4956 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
c937aa89 4957 addReply(c,shared.czero);
ed9b544e 4958 return;
4959 }
ed9b544e 4960 incrRefCount(o);
4961
4962 /* OK! key moved, free the entry in the source DB */
09241813 4963 dbDelete(src,c->argv[1]);
ed9b544e 4964 server.dirty++;
c937aa89 4965 addReply(c,shared.cone);
ed9b544e 4966}
4967
4968/* =================================== Lists ================================ */
d0686e07
PN
4969
4970
4971/* Check the argument length to see if it requires us to convert the ziplist
4972 * to a real list. Only check raw-encoded objects because integer encoded
4973 * objects are never too long. */
003f0840 4974static void listTypeTryConversion(robj *subject, robj *value) {
d0686e07
PN
4975 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4976 if (value->encoding == REDIS_ENCODING_RAW &&
4977 sdslen(value->ptr) > server.list_max_ziplist_value)
003f0840 4978 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07
PN
4979}
4980
003f0840 4981static void listTypePush(robj *subject, robj *value, int where) {
d0686e07 4982 /* Check if we need to convert the ziplist */
003f0840 4983 listTypeTryConversion(subject,value);
d0686e07
PN
4984 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4985 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
003f0840 4986 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07 4987
c7d9d662
PN
4988 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4989 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4990 value = getDecodedObject(value);
4991 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4992 decrRefCount(value);
4993 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4994 if (where == REDIS_HEAD) {
4995 listAddNodeHead(subject->ptr,value);
4996 } else {
4997 listAddNodeTail(subject->ptr,value);
4998 }
4999 incrRefCount(value);
5000 } else {
5001 redisPanic("Unknown list encoding");
5002 }
5003}
5004
003f0840 5005static robj *listTypePop(robj *subject, int where) {
d72562f7
PN
5006 robj *value = NULL;
5007 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5008 unsigned char *p;
b6eb9703 5009 unsigned char *vstr;
d72562f7 5010 unsigned int vlen;
b6eb9703 5011 long long vlong;
d72562f7
PN
5012 int pos = (where == REDIS_HEAD) ? 0 : -1;
5013 p = ziplistIndex(subject->ptr,pos);
b6eb9703
PN
5014 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5015 if (vstr) {
5016 value = createStringObject((char*)vstr,vlen);
d72562f7 5017 } else {
b6eb9703 5018 value = createStringObjectFromLongLong(vlong);
d72562f7 5019 }
0f62e177
PN
5020 /* We only need to delete an element when it exists */
5021 subject->ptr = ziplistDelete(subject->ptr,&p);
d72562f7 5022 }
d72562f7
PN
5023 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5024 list *list = subject->ptr;
5025 listNode *ln;
5026 if (where == REDIS_HEAD) {
5027 ln = listFirst(list);
5028 } else {
5029 ln = listLast(list);
5030 }
5031 if (ln != NULL) {
5032 value = listNodeValue(ln);
5033 incrRefCount(value);
5034 listDelNode(list,ln);
5035 }
5036 } else {
5037 redisPanic("Unknown list encoding");
5038 }
5039 return value;
5040}
5041
003f0840 5042static unsigned long listTypeLength(robj *subject) {
d72562f7
PN
5043 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5044 return ziplistLen(subject->ptr);
5045 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5046 return listLength((list*)subject->ptr);
5047 } else {
5048 redisPanic("Unknown list encoding");
5049 }
5050}
5051
a6dd455b
PN
5052/* Structure to hold set iteration abstraction. */
5053typedef struct {
5054 robj *subject;
5055 unsigned char encoding;
be02a7c0 5056 unsigned char direction; /* Iteration direction */
a6dd455b
PN
5057 unsigned char *zi;
5058 listNode *ln;
003f0840 5059} listTypeIterator;
a6dd455b 5060
be02a7c0
PN
5061/* Structure for an entry while iterating over a list. */
5062typedef struct {
003f0840 5063 listTypeIterator *li;
be02a7c0
PN
5064 unsigned char *zi; /* Entry in ziplist */
5065 listNode *ln; /* Entry in linked list */
003f0840 5066} listTypeEntry;
be02a7c0 5067
a6dd455b 5068/* Initialize an iterator at the specified index. */
003f0840
PN
5069static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5070 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
a6dd455b
PN
5071 li->subject = subject;
5072 li->encoding = subject->encoding;
be02a7c0 5073 li->direction = direction;
a6dd455b
PN
5074 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5075 li->zi = ziplistIndex(subject->ptr,index);
5076 } else if (li->encoding == REDIS_ENCODING_LIST) {
5077 li->ln = listIndex(subject->ptr,index);
5078 } else {
5079 redisPanic("Unknown list encoding");
5080 }
5081 return li;
5082}
5083
5084/* Clean up the iterator. */
003f0840 5085static void listTypeReleaseIterator(listTypeIterator *li) {
a6dd455b
PN
5086 zfree(li);
5087}
5088
be02a7c0
PN
5089/* Stores pointer to current the entry in the provided entry structure
5090 * and advances the position of the iterator. Returns 1 when the current
5091 * entry is in fact an entry, 0 otherwise. */
003f0840 5092static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
dda20542
PN
5093 /* Protect from converting when iterating */
5094 redisAssert(li->subject->encoding == li->encoding);
5095
be02a7c0 5096 entry->li = li;
d2ee16ab 5097 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5098 entry->zi = li->zi;
5099 if (entry->zi != NULL) {
5100 if (li->direction == REDIS_TAIL)
5101 li->zi = ziplistNext(li->subject->ptr,li->zi);
5102 else
5103 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5104 return 1;
5105 }
d2ee16ab 5106 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5107 entry->ln = li->ln;
5108 if (entry->ln != NULL) {
5109 if (li->direction == REDIS_TAIL)
5110 li->ln = li->ln->next;
5111 else
5112 li->ln = li->ln->prev;
5113 return 1;
5114 }
d2ee16ab
PN
5115 } else {
5116 redisPanic("Unknown list encoding");
5117 }
be02a7c0 5118 return 0;
d2ee16ab
PN
5119}
5120
a6dd455b 5121/* Return entry or NULL at the current position of the iterator. */
003f0840
PN
5122static robj *listTypeGet(listTypeEntry *entry) {
5123 listTypeIterator *li = entry->li;
a6dd455b
PN
5124 robj *value = NULL;
5125 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
b6eb9703 5126 unsigned char *vstr;
a6dd455b 5127 unsigned int vlen;
b6eb9703 5128 long long vlong;
be02a7c0 5129 redisAssert(entry->zi != NULL);
b6eb9703
PN
5130 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5131 if (vstr) {
5132 value = createStringObject((char*)vstr,vlen);
a6dd455b 5133 } else {
b6eb9703 5134 value = createStringObjectFromLongLong(vlong);
a6dd455b
PN
5135 }
5136 }
5137 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5138 redisAssert(entry->ln != NULL);
5139 value = listNodeValue(entry->ln);
a6dd455b
PN
5140 incrRefCount(value);
5141 } else {
5142 redisPanic("Unknown list encoding");
5143 }
5144 return value;
5145}
5146
d2ee16ab 5147/* Compare the given object with the entry at the current position. */
003f0840
PN
5148static int listTypeEqual(listTypeEntry *entry, robj *o) {
5149 listTypeIterator *li = entry->li;
d2ee16ab
PN
5150 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5151 redisAssert(o->encoding == REDIS_ENCODING_RAW);
be02a7c0 5152 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
d2ee16ab 5153 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0 5154 return equalStringObjects(o,listNodeValue(entry->ln));
d2ee16ab
PN
5155 } else {
5156 redisPanic("Unknown list encoding");
5157 }
5158}
5159
be02a7c0 5160/* Delete the element pointed to. */
003f0840
PN
5161static void listTypeDelete(listTypeEntry *entry) {
5162 listTypeIterator *li = entry->li;
a6dd455b 5163 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5164 unsigned char *p = entry->zi;
5165 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5166
5167 /* Update position of the iterator depending on the direction */
5168 if (li->direction == REDIS_TAIL)
5169 li->zi = p;
a6dd455b 5170 else
be02a7c0
PN
5171 li->zi = ziplistPrev(li->subject->ptr,p);
5172 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5173 listNode *next;
5174 if (li->direction == REDIS_TAIL)
5175 next = entry->ln->next;
a6dd455b 5176 else
be02a7c0
PN
5177 next = entry->ln->prev;
5178 listDelNode(li->subject->ptr,entry->ln);
5179 li->ln = next;
a6dd455b
PN
5180 } else {
5181 redisPanic("Unknown list encoding");
5182 }
5183}
3305306f 5184
003f0840
PN
5185static void listTypeConvert(robj *subject, int enc) {
5186 listTypeIterator *li;
5187 listTypeEntry entry;
d0686e07
PN
5188 redisAssert(subject->type == REDIS_LIST);
5189
5190 if (enc == REDIS_ENCODING_LIST) {
5191 list *l = listCreate();
cd627d4e 5192 listSetFreeMethod(l,decrRefCount);
d0686e07 5193
003f0840
PN
5194 /* listTypeGet returns a robj with incremented refcount */
5195 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5196 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5197 listTypeReleaseIterator(li);
d0686e07
PN
5198
5199 subject->encoding = REDIS_ENCODING_LIST;
5200 zfree(subject->ptr);
5201 subject->ptr = l;
5202 } else {
5203 redisPanic("Unsupported list conversion");
5204 }
5205}
5206
c7d9d662
PN
5207static void pushGenericCommand(redisClient *c, int where) {
5208 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
3305306f 5209 if (lobj == NULL) {
95242ab5 5210 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5211 addReply(c,shared.cone);
95242ab5 5212 return;
5213 }
1cd92e7f 5214 lobj = createZiplistObject();
09241813 5215 dbAdd(c->db,c->argv[1],lobj);
ed9b544e 5216 } else {
ed9b544e 5217 if (lobj->type != REDIS_LIST) {
5218 addReply(c,shared.wrongtypeerr);
5219 return;
5220 }
95242ab5 5221 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5222 addReply(c,shared.cone);
95242ab5 5223 return;
5224 }
ed9b544e 5225 }
003f0840
PN
5226 listTypePush(lobj,c->argv[2],where);
5227 addReplyLongLong(c,listTypeLength(lobj));
ed9b544e 5228 server.dirty++;
ed9b544e 5229}
5230
5231static void lpushCommand(redisClient *c) {
5232 pushGenericCommand(c,REDIS_HEAD);
5233}
5234
5235static void rpushCommand(redisClient *c) {
5236 pushGenericCommand(c,REDIS_TAIL);
5237}
5238
5239static void llenCommand(redisClient *c) {
d72562f7
PN
5240 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5241 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
003f0840 5242 addReplyUlong(c,listTypeLength(o));
ed9b544e 5243}
5244
5245static void lindexCommand(redisClient *c) {
697bd567
PN
5246 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5247 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5248 int index = atoi(c->argv[2]->ptr);
bd8db0ad 5249 robj *value = NULL;
dd88747b 5250
697bd567
PN
5251 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5252 unsigned char *p;
b6eb9703 5253 unsigned char *vstr;
697bd567 5254 unsigned int vlen;
b6eb9703 5255 long long vlong;
697bd567 5256 p = ziplistIndex(o->ptr,index);
b6eb9703
PN
5257 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5258 if (vstr) {
5259 value = createStringObject((char*)vstr,vlen);
697bd567 5260 } else {
b6eb9703 5261 value = createStringObjectFromLongLong(vlong);
697bd567 5262 }
bd8db0ad
PN
5263 addReplyBulk(c,value);
5264 decrRefCount(value);
697bd567
PN
5265 } else {
5266 addReply(c,shared.nullbulk);
5267 }
5268 } else if (o->encoding == REDIS_ENCODING_LIST) {
5269 listNode *ln = listIndex(o->ptr,index);
5270 if (ln != NULL) {
bd8db0ad
PN
5271 value = listNodeValue(ln);
5272 addReplyBulk(c,value);
697bd567
PN
5273 } else {
5274 addReply(c,shared.nullbulk);
5275 }
ed9b544e 5276 } else {
697bd567 5277 redisPanic("Unknown list encoding");
ed9b544e 5278 }
5279}
5280
5281static void lsetCommand(redisClient *c) {
697bd567
PN
5282 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5283 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5284 int index = atoi(c->argv[2]->ptr);
697bd567 5285 robj *value = c->argv[3];
dd88747b 5286
003f0840 5287 listTypeTryConversion(o,value);
697bd567
PN
5288 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5289 unsigned char *p, *zl = o->ptr;
5290 p = ziplistIndex(zl,index);
5291 if (p == NULL) {
5292 addReply(c,shared.outofrangeerr);
5293 } else {
be02a7c0 5294 o->ptr = ziplistDelete(o->ptr,&p);
697bd567
PN
5295 value = getDecodedObject(value);
5296 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5297 decrRefCount(value);
5298 addReply(c,shared.ok);
5299 server.dirty++;
5300 }
5301 } else if (o->encoding == REDIS_ENCODING_LIST) {
5302 listNode *ln = listIndex(o->ptr,index);
5303 if (ln == NULL) {
5304 addReply(c,shared.outofrangeerr);
5305 } else {
5306 decrRefCount((robj*)listNodeValue(ln));
5307 listNodeValue(ln) = value;
5308 incrRefCount(value);
5309 addReply(c,shared.ok);
5310 server.dirty++;
5311 }
ed9b544e 5312 } else {
697bd567 5313 redisPanic("Unknown list encoding");
ed9b544e 5314 }
5315}
5316
5317static void popGenericCommand(redisClient *c, int where) {
d72562f7
PN
5318 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5319 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
3305306f 5320
003f0840 5321 robj *value = listTypePop(o,where);
d72562f7 5322 if (value == NULL) {
dd88747b 5323 addReply(c,shared.nullbulk);
5324 } else {
d72562f7
PN
5325 addReplyBulk(c,value);
5326 decrRefCount(value);
003f0840 5327 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5328 server.dirty++;
ed9b544e 5329 }
5330}
5331
5332static void lpopCommand(redisClient *c) {
5333 popGenericCommand(c,REDIS_HEAD);
5334}
5335
5336static void rpopCommand(redisClient *c) {
5337 popGenericCommand(c,REDIS_TAIL);
5338}
5339
5340static void lrangeCommand(redisClient *c) {
a6dd455b 5341 robj *o, *value;
ed9b544e 5342 int start = atoi(c->argv[2]->ptr);
5343 int end = atoi(c->argv[3]->ptr);
dd88747b 5344 int llen;
5345 int rangelen, j;
003f0840 5346 listTypeEntry entry;
dd88747b 5347
4e27f268 5348 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5349 || checkType(c,o,REDIS_LIST)) return;
003f0840 5350 llen = listTypeLength(o);
dd88747b 5351
5352 /* convert negative indexes */
5353 if (start < 0) start = llen+start;
5354 if (end < 0) end = llen+end;
5355 if (start < 0) start = 0;
5356 if (end < 0) end = 0;
5357
5358 /* indexes sanity checks */
5359 if (start > end || start >= llen) {
5360 /* Out of range start or start > end result in empty list */
5361 addReply(c,shared.emptymultibulk);
5362 return;
5363 }
5364 if (end >= llen) end = llen-1;
5365 rangelen = (end-start)+1;
3305306f 5366
dd88747b 5367 /* Return the result in form of a multi-bulk reply */
dd88747b 5368 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
003f0840 5369 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
dd88747b 5370 for (j = 0; j < rangelen; j++) {
003f0840
PN
5371 redisAssert(listTypeNext(li,&entry));
5372 value = listTypeGet(&entry);
a6dd455b 5373 addReplyBulk(c,value);
be02a7c0 5374 decrRefCount(value);
ed9b544e 5375 }
003f0840 5376 listTypeReleaseIterator(li);
ed9b544e 5377}
5378
5379static void ltrimCommand(redisClient *c) {
3305306f 5380 robj *o;
ed9b544e 5381 int start = atoi(c->argv[2]->ptr);
5382 int end = atoi(c->argv[3]->ptr);
dd88747b 5383 int llen;
5384 int j, ltrim, rtrim;
5385 list *list;
5386 listNode *ln;
5387
5388 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5389 checkType(c,o,REDIS_LIST)) return;
003f0840 5390 llen = listTypeLength(o);
dd88747b 5391
5392 /* convert negative indexes */
5393 if (start < 0) start = llen+start;
5394 if (end < 0) end = llen+end;
5395 if (start < 0) start = 0;
5396 if (end < 0) end = 0;
5397
5398 /* indexes sanity checks */
5399 if (start > end || start >= llen) {
5400 /* Out of range start or start > end result in empty list */
5401 ltrim = llen;
5402 rtrim = 0;
ed9b544e 5403 } else {
dd88747b 5404 if (end >= llen) end = llen-1;
5405 ltrim = start;
5406 rtrim = llen-end-1;
5407 }
ed9b544e 5408
dd88747b 5409 /* Remove list elements to perform the trim */
9ae6b0be
PN
5410 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5411 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5412 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5413 } else if (o->encoding == REDIS_ENCODING_LIST) {
5414 list = o->ptr;
5415 for (j = 0; j < ltrim; j++) {
5416 ln = listFirst(list);
5417 listDelNode(list,ln);
5418 }
5419 for (j = 0; j < rtrim; j++) {
5420 ln = listLast(list);
5421 listDelNode(list,ln);
5422 }
5423 } else {
5424 redisPanic("Unknown list encoding");
ed9b544e 5425 }
003f0840 5426 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5427 server.dirty++;
5428 addReply(c,shared.ok);
ed9b544e 5429}
5430
5431static void lremCommand(redisClient *c) {
d2ee16ab 5432 robj *subject, *obj = c->argv[3];
dd88747b 5433 int toremove = atoi(c->argv[2]->ptr);
5434 int removed = 0;
003f0840 5435 listTypeEntry entry;
a4d1ba9a 5436
d2ee16ab
PN
5437 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5438 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
dd88747b 5439
d2ee16ab
PN
5440 /* Make sure obj is raw when we're dealing with a ziplist */
5441 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5442 obj = getDecodedObject(obj);
5443
003f0840 5444 listTypeIterator *li;
dd88747b 5445 if (toremove < 0) {
5446 toremove = -toremove;
003f0840 5447 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
d2ee16ab 5448 } else {
003f0840 5449 li = listTypeInitIterator(subject,0,REDIS_TAIL);
dd88747b 5450 }
dd88747b 5451
003f0840
PN
5452 while (listTypeNext(li,&entry)) {
5453 if (listTypeEqual(&entry,obj)) {
5454 listTypeDelete(&entry);
dd88747b 5455 server.dirty++;
5456 removed++;
3fbf9001 5457 if (toremove && removed == toremove) break;
ed9b544e 5458 }
5459 }
003f0840 5460 listTypeReleaseIterator(li);
d2ee16ab
PN
5461
5462 /* Clean up raw encoded object */
5463 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5464 decrRefCount(obj);
5465
003f0840 5466 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5467 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5468}
5469
12f9d551 5470/* This is the semantic of this command:
0f5f7e9a 5471 * RPOPLPUSH srclist dstlist:
12f9d551 5472 * IF LLEN(srclist) > 0
5473 * element = RPOP srclist
5474 * LPUSH dstlist element
5475 * RETURN element
5476 * ELSE
5477 * RETURN nil
5478 * END
5479 * END
5480 *
5481 * The idea is to be able to get an element from a list in a reliable way
5482 * since the element is not just returned but pushed against another list
5483 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5484 */
0f5f7e9a 5485static void rpoplpushcommand(redisClient *c) {
0f62e177 5486 robj *sobj, *value;
dd88747b 5487 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5488 checkType(c,sobj,REDIS_LIST)) return;
12f9d551 5489
003f0840 5490 if (listTypeLength(sobj) == 0) {
12f9d551 5491 addReply(c,shared.nullbulk);
5492 } else {
dd88747b 5493 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
0f62e177 5494 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
003f0840 5495 value = listTypePop(sobj,REDIS_TAIL);
12f9d551 5496
dd88747b 5497 /* Add the element to the target list (unless it's directly
5498 * passed to some BLPOP-ing client */
0f62e177
PN
5499 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5500 /* Create the list if the key does not exist */
5501 if (!dobj) {
1cd92e7f 5502 dobj = createZiplistObject();
09241813 5503 dbAdd(c->db,c->argv[2],dobj);
12f9d551 5504 }
003f0840 5505 listTypePush(dobj,value,REDIS_HEAD);
12f9d551 5506 }
dd88747b 5507
5508 /* Send the element to the client as reply as well */
0f62e177
PN
5509 addReplyBulk(c,value);
5510
003f0840 5511 /* listTypePop returns an object with its refcount incremented */
0f62e177 5512 decrRefCount(value);
dd88747b 5513
0f62e177 5514 /* Delete the source list when it is empty */
003f0840 5515 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5516 server.dirty++;
12f9d551 5517 }
5518}
5519
ed9b544e 5520/* ==================================== Sets ================================ */
5521
d0b58d53
PN
5522/* Factory method to return a set that *can* hold "value". When the object has
5523 * an integer-encodable value, an intset will be returned. Otherwise a regular
5524 * hash table. */
5525static robj *setTypeCreate(robj *value) {
5526 if (getLongLongFromObject(value,NULL) == REDIS_OK)
5527 return createIntsetObject();
5528 return createSetObject();
5529}
5530
35cabcb5 5531static int setTypeAdd(robj *subject, robj *value) {
d0b58d53 5532 long long llval;
35cabcb5
PN
5533 if (subject->encoding == REDIS_ENCODING_HT) {
5534 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5535 incrRefCount(value);
5536 return 1;
5537 }
d0b58d53
PN
5538 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5539 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
70ff3511 5540 uint8_t success = 0;
d0b58d53 5541 subject->ptr = intsetAdd(subject->ptr,llval,&success);
70ff3511
PN
5542 if (success) {
5543 /* Convert to regular set when the intset contains
5544 * too many entries. */
5545 if (intsetLen(subject->ptr) > server.set_max_intset_entries)
5546 setTypeConvert(subject,REDIS_ENCODING_HT);
5547 return 1;
5548 }
d0b58d53
PN
5549 } else {
5550 /* Failed to get integer from object, convert to regular set. */
5551 setTypeConvert(subject,REDIS_ENCODING_HT);
5552
5553 /* The set *was* an intset and this value is not integer
5554 * encodable, so dictAdd should always work. */
5555 redisAssert(dictAdd(subject->ptr,value,NULL) == DICT_OK);
5556 incrRefCount(value);
5557 return 1;
5558 }
35cabcb5
PN
5559 } else {
5560 redisPanic("Unknown set encoding");
5561 }
5562 return 0;
5563}
5564
5565static int setTypeRemove(robj *subject, robj *value) {
d0b58d53 5566 long long llval;
35cabcb5
PN
5567 if (subject->encoding == REDIS_ENCODING_HT) {
5568 if (dictDelete(subject->ptr,value) == DICT_OK) {
5569 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5570 return 1;
5571 }
d0b58d53
PN
5572 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5573 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5574 uint8_t success;
5575 subject->ptr = intsetRemove(subject->ptr,llval,&success);
5576 if (success) return 1;
5577 }
35cabcb5
PN
5578 } else {
5579 redisPanic("Unknown set encoding");
5580 }
5581 return 0;
5582}
5583
5584static int setTypeIsMember(robj *subject, robj *value) {
d0b58d53 5585 long long llval;
35cabcb5
PN
5586 if (subject->encoding == REDIS_ENCODING_HT) {
5587 return dictFind((dict*)subject->ptr,value) != NULL;
d0b58d53
PN
5588 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5589 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5590 return intsetFind((intset*)subject->ptr,llval);
5591 }
35cabcb5
PN
5592 } else {
5593 redisPanic("Unknown set encoding");
5594 }
d0b58d53 5595 return 0;
35cabcb5
PN
5596}
5597
5598/* Structure to hold set iteration abstraction. */
5599typedef struct {
d0b58d53 5600 robj *subject;
35cabcb5 5601 int encoding;
d0b58d53 5602 int ii; /* intset iterator */
35cabcb5
PN
5603 dictIterator *di;
5604} setIterator;
5605
5606static setIterator *setTypeInitIterator(robj *subject) {
5607 setIterator *si = zmalloc(sizeof(setIterator));
d0b58d53 5608 si->subject = subject;
35cabcb5
PN
5609 si->encoding = subject->encoding;
5610 if (si->encoding == REDIS_ENCODING_HT) {
5611 si->di = dictGetIterator(subject->ptr);
d0b58d53
PN
5612 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5613 si->ii = 0;
35cabcb5
PN
5614 } else {
5615 redisPanic("Unknown set encoding");
5616 }
5617 return si;
5618}
5619
5620static void setTypeReleaseIterator(setIterator *si) {
5621 if (si->encoding == REDIS_ENCODING_HT)
5622 dictReleaseIterator(si->di);
5623 zfree(si);
5624}
5625
5626/* Move to the next entry in the set. Returns the object at the current
5627 * position, or NULL when the end is reached. This object will have its
5628 * refcount incremented, so the caller needs to take care of this. */
5629static robj *setTypeNext(setIterator *si) {
5630 robj *ret = NULL;
5631 if (si->encoding == REDIS_ENCODING_HT) {
5632 dictEntry *de = dictNext(si->di);
5633 if (de != NULL) {
5634 ret = dictGetEntryKey(de);
5635 incrRefCount(ret);
5636 }
d0b58d53
PN
5637 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5638 long long llval;
5639 if (intsetGet(si->subject->ptr,si->ii++,&llval))
5640 ret = createStringObjectFromLongLong(llval);
35cabcb5
PN
5641 }
5642 return ret;
5643}
5644
5645
5646/* Return random element from set. The returned object will always have
5647 * an incremented refcount. */
5648robj *setTypeRandomElement(robj *subject) {
5649 robj *ret = NULL;
5650 if (subject->encoding == REDIS_ENCODING_HT) {
5651 dictEntry *de = dictGetRandomKey(subject->ptr);
5652 ret = dictGetEntryKey(de);
5653 incrRefCount(ret);
d0b58d53
PN
5654 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5655 long long llval = intsetRandom(subject->ptr);
5656 ret = createStringObjectFromLongLong(llval);
35cabcb5
PN
5657 } else {
5658 redisPanic("Unknown set encoding");
5659 }
5660 return ret;
5661}
5662
5663static unsigned long setTypeSize(robj *subject) {
5664 if (subject->encoding == REDIS_ENCODING_HT) {
5665 return dictSize((dict*)subject->ptr);
d0b58d53
PN
5666 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5667 return intsetLen((intset*)subject->ptr);
35cabcb5
PN
5668 } else {
5669 redisPanic("Unknown set encoding");
5670 }
5671}
5672
400aea2b
PN
5673/* Convert the set to specified encoding. The resulting dict (when converting
5674 * to a hashtable) is presized to hold the number of elements in the original
5675 * set. */
d0b58d53
PN
5676static void setTypeConvert(robj *subject, int enc) {
5677 setIterator *si;
5678 robj *element;
5679 redisAssert(subject->type == REDIS_SET);
5680
5681 if (enc == REDIS_ENCODING_HT) {
5682 dict *d = dictCreate(&setDictType,NULL);
400aea2b
PN
5683 /* Presize the dict to avoid rehashing */
5684 dictExpand(d,intsetLen(subject->ptr));
d0b58d53
PN
5685
5686 /* setTypeGet returns a robj with incremented refcount */
5687 si = setTypeInitIterator(subject);
5688 while ((element = setTypeNext(si)) != NULL)
5689 redisAssert(dictAdd(d,element,NULL) == DICT_OK);
5690 setTypeReleaseIterator(si);
5691
5692 subject->encoding = REDIS_ENCODING_HT;
5693 zfree(subject->ptr);
5694 subject->ptr = d;
5695 } else {
5696 redisPanic("Unsupported set conversion");
5697 }
5698}
5699
ed9b544e 5700static void saddCommand(redisClient *c) {
ed9b544e 5701 robj *set;
5702
3305306f 5703 set = lookupKeyWrite(c->db,c->argv[1]);
5704 if (set == NULL) {
d0b58d53 5705 set = setTypeCreate(c->argv[2]);
09241813 5706 dbAdd(c->db,c->argv[1],set);
ed9b544e 5707 } else {
ed9b544e 5708 if (set->type != REDIS_SET) {
c937aa89 5709 addReply(c,shared.wrongtypeerr);
ed9b544e 5710 return;
5711 }
5712 }
35cabcb5 5713 if (setTypeAdd(set,c->argv[2])) {
ed9b544e 5714 server.dirty++;
c937aa89 5715 addReply(c,shared.cone);
ed9b544e 5716 } else {
c937aa89 5717 addReply(c,shared.czero);
ed9b544e 5718 }
5719}
5720
5721static void sremCommand(redisClient *c) {
3305306f 5722 robj *set;
ed9b544e 5723
dd88747b 5724 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5725 checkType(c,set,REDIS_SET)) return;
5726
35cabcb5
PN
5727 if (setTypeRemove(set,c->argv[2])) {
5728 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5729 server.dirty++;
dd88747b 5730 addReply(c,shared.cone);
ed9b544e 5731 } else {
dd88747b 5732 addReply(c,shared.czero);
ed9b544e 5733 }
5734}
5735
a4460ef4 5736static void smoveCommand(redisClient *c) {
b978abbf 5737 robj *srcset, *dstset, *ele;
a4460ef4 5738 srcset = lookupKeyWrite(c->db,c->argv[1]);
5739 dstset = lookupKeyWrite(c->db,c->argv[2]);
b978abbf 5740 ele = c->argv[3];
a4460ef4 5741
b978abbf
PN
5742 /* If the source key does not exist return 0 */
5743 if (srcset == NULL) {
5744 addReply(c,shared.czero);
a4460ef4 5745 return;
5746 }
b978abbf
PN
5747
5748 /* If the source key has the wrong type, or the destination key
5749 * is set and has the wrong type, return with an error. */
5750 if (checkType(c,srcset,REDIS_SET) ||
5751 (dstset && checkType(c,dstset,REDIS_SET))) return;
5752
5753 /* If srcset and dstset are equal, SMOVE is a no-op */
5754 if (srcset == dstset) {
5755 addReply(c,shared.cone);
a4460ef4 5756 return;
5757 }
b978abbf
PN
5758
5759 /* If the element cannot be removed from the src set, return 0. */
5760 if (!setTypeRemove(srcset,ele)) {
a4460ef4 5761 addReply(c,shared.czero);
5762 return;
5763 }
b978abbf
PN
5764
5765 /* Remove the src set from the database when empty */
5766 if (setTypeSize(srcset) == 0) dbDelete(c->db,c->argv[1]);
a4460ef4 5767 server.dirty++;
b978abbf
PN
5768
5769 /* Create the destination set when it doesn't exist */
a4460ef4 5770 if (!dstset) {
b978abbf 5771 dstset = setTypeCreate(ele);
09241813 5772 dbAdd(c->db,c->argv[2],dstset);
a4460ef4 5773 }
b978abbf
PN
5774
5775 /* An extra key has changed when ele was successfully added to dstset */
5776 if (setTypeAdd(dstset,ele)) server.dirty++;
a4460ef4 5777 addReply(c,shared.cone);
5778}
5779
ed9b544e 5780static void sismemberCommand(redisClient *c) {
3305306f 5781 robj *set;
ed9b544e 5782
dd88747b 5783 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5784 checkType(c,set,REDIS_SET)) return;
5785
35cabcb5 5786 if (setTypeIsMember(set,c->argv[2]))
dd88747b 5787 addReply(c,shared.cone);
5788 else
c937aa89 5789 addReply(c,shared.czero);
ed9b544e 5790}
5791
5792static void scardCommand(redisClient *c) {
3305306f 5793 robj *o;
dd88747b 5794
5795 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5796 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5797
35cabcb5 5798 addReplyUlong(c,setTypeSize(o));
ed9b544e 5799}
5800
12fea928 5801static void spopCommand(redisClient *c) {
35cabcb5 5802 robj *set, *ele;
12fea928 5803
dd88747b 5804 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5805 checkType(c,set,REDIS_SET)) return;
5806
35cabcb5
PN
5807 ele = setTypeRandomElement(set);
5808 if (ele == NULL) {
12fea928 5809 addReply(c,shared.nullbulk);
5810 } else {
35cabcb5 5811 setTypeRemove(set,ele);
dd88747b 5812 addReplyBulk(c,ele);
35cabcb5
PN
5813 decrRefCount(ele);
5814 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5815 server.dirty++;
12fea928 5816 }
5817}
5818
2abb95a9 5819static void srandmemberCommand(redisClient *c) {
35cabcb5 5820 robj *set, *ele;
2abb95a9 5821
dd88747b 5822 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5823 checkType(c,set,REDIS_SET)) return;
5824
35cabcb5
PN
5825 ele = setTypeRandomElement(set);
5826 if (ele == NULL) {
2abb95a9 5827 addReply(c,shared.nullbulk);
5828 } else {
dd88747b 5829 addReplyBulk(c,ele);
35cabcb5 5830 decrRefCount(ele);
2abb95a9 5831 }
5832}
5833
ed9b544e 5834static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
35cabcb5 5835 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
ed9b544e 5836}
5837
35cabcb5
PN
5838static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5839 robj **sets = zmalloc(sizeof(robj*)*setnum);
5840 setIterator *si;
5841 robj *ele, *lenobj = NULL, *dstset = NULL;
682ac724 5842 unsigned long j, cardinality = 0;
ed9b544e 5843
35cabcb5
PN
5844 for (j = 0; j < setnum; j++) {
5845 robj *setobj = dstkey ?
5846 lookupKeyWrite(c->db,setkeys[j]) :
5847 lookupKeyRead(c->db,setkeys[j]);
3305306f 5848 if (!setobj) {
35cabcb5 5849 zfree(sets);
5faa6025 5850 if (dstkey) {
09241813 5851 if (dbDelete(c->db,dstkey))
fdcaae84 5852 server.dirty++;
0d36ded0 5853 addReply(c,shared.czero);
5faa6025 5854 } else {
4e27f268 5855 addReply(c,shared.emptymultibulk);
5faa6025 5856 }
ed9b544e 5857 return;
5858 }
35cabcb5
PN
5859 if (checkType(c,setobj,REDIS_SET)) {
5860 zfree(sets);
ed9b544e 5861 return;
5862 }
35cabcb5 5863 sets[j] = setobj;
ed9b544e 5864 }
5865 /* Sort sets from the smallest to largest, this will improve our
5866 * algorithm's performace */
35cabcb5 5867 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
ed9b544e 5868
5869 /* The first thing we should output is the total number of elements...
5870 * since this is a multi-bulk write, but at this stage we don't know
5871 * the intersection set size, so we use a trick, append an empty object
5872 * to the output list and save the pointer to later modify it with the
5873 * right length */
5874 if (!dstkey) {
5875 lenobj = createObject(REDIS_STRING,NULL);
5876 addReply(c,lenobj);
5877 decrRefCount(lenobj);
5878 } else {
5879 /* If we have a target key where to store the resulting set
5880 * create this key with an empty set inside */
d0b58d53 5881 dstset = createIntsetObject();
ed9b544e 5882 }
5883
5884 /* Iterate all the elements of the first (smallest) set, and test
5885 * the element against all the other sets, if at least one set does
5886 * not include the element it is discarded */
35cabcb5
PN
5887 si = setTypeInitIterator(sets[0]);
5888 while((ele = setTypeNext(si)) != NULL) {
5889 for (j = 1; j < setnum; j++)
5890 if (!setTypeIsMember(sets[j],ele)) break;
5891
5892 /* Only take action when all sets contain the member */
5893 if (j == setnum) {
5894 if (!dstkey) {
5895 addReplyBulk(c,ele);
5896 cardinality++;
5897 } else {
5898 setTypeAdd(dstset,ele);
5899 }
ed9b544e 5900 }
35cabcb5 5901 decrRefCount(ele);
ed9b544e 5902 }
35cabcb5 5903 setTypeReleaseIterator(si);
ed9b544e 5904
83cdfe18 5905 if (dstkey) {
3ea27d37 5906 /* Store the resulting set into the target, if the intersection
5907 * is not an empty set. */
09241813 5908 dbDelete(c->db,dstkey);
35cabcb5 5909 if (setTypeSize(dstset) > 0) {
09241813 5910 dbAdd(c->db,dstkey,dstset);
35cabcb5 5911 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 5912 } else {
5913 decrRefCount(dstset);
d36c4e97 5914 addReply(c,shared.czero);
3ea27d37 5915 }
40d224a9 5916 server.dirty++;
d36c4e97 5917 } else {
5918 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5919 }
35cabcb5 5920 zfree(sets);
ed9b544e 5921}
5922
5923static void sinterCommand(redisClient *c) {
5924 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5925}
5926
5927static void sinterstoreCommand(redisClient *c) {
5928 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5929}
5930
/* Set operation selectors, passed as the "op" argument of
 * sunionDiffGenericCommand. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF  1
#define REDIS_OP_INTER 2
f4f56e1d 5934
35cabcb5
PN
5935static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5936 robj **sets = zmalloc(sizeof(robj*)*setnum);
5937 setIterator *si;
5938 robj *ele, *dstset = NULL;
40d224a9 5939 int j, cardinality = 0;
5940
35cabcb5
PN
5941 for (j = 0; j < setnum; j++) {
5942 robj *setobj = dstkey ?
5943 lookupKeyWrite(c->db,setkeys[j]) :
5944 lookupKeyRead(c->db,setkeys[j]);
40d224a9 5945 if (!setobj) {
35cabcb5 5946 sets[j] = NULL;
40d224a9 5947 continue;
5948 }
35cabcb5
PN
5949 if (checkType(c,setobj,REDIS_SET)) {
5950 zfree(sets);
40d224a9 5951 return;
5952 }
35cabcb5 5953 sets[j] = setobj;
40d224a9 5954 }
5955
5956 /* We need a temp set object to store our union. If the dstkey
5957 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5958 * this set object will be the resulting object to set into the target key*/
d0b58d53 5959 dstset = createIntsetObject();
40d224a9 5960
40d224a9 5961 /* Iterate all the elements of all the sets, add every element a single
5962 * time to the result set */
35cabcb5
PN
5963 for (j = 0; j < setnum; j++) {
5964 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5965 if (!sets[j]) continue; /* non existing keys are like empty sets */
40d224a9 5966
35cabcb5
PN
5967 si = setTypeInitIterator(sets[j]);
5968 while((ele = setTypeNext(si)) != NULL) {
f4f56e1d 5969 if (op == REDIS_OP_UNION || j == 0) {
35cabcb5 5970 if (setTypeAdd(dstset,ele)) {
40d224a9 5971 cardinality++;
5972 }
f4f56e1d 5973 } else if (op == REDIS_OP_DIFF) {
35cabcb5 5974 if (setTypeRemove(dstset,ele)) {
f4f56e1d 5975 cardinality--;
5976 }
40d224a9 5977 }
35cabcb5 5978 decrRefCount(ele);
40d224a9 5979 }
35cabcb5 5980 setTypeReleaseIterator(si);
51829ed3 5981
35cabcb5 5982 /* Exit when result set is empty. */
d36c4e97 5983 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5984 }
5985
f4f56e1d 5986 /* Output the content of the resulting set, if not in STORE mode */
5987 if (!dstkey) {
5988 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
35cabcb5
PN
5989 si = setTypeInitIterator(dstset);
5990 while((ele = setTypeNext(si)) != NULL) {
dd88747b 5991 addReplyBulk(c,ele);
35cabcb5 5992 decrRefCount(ele);
f4f56e1d 5993 }
35cabcb5 5994 setTypeReleaseIterator(si);
d36c4e97 5995 decrRefCount(dstset);
83cdfe18
AG
5996 } else {
5997 /* If we have a target key where to store the resulting set
5998 * create this key with the result set inside */
09241813 5999 dbDelete(c->db,dstkey);
35cabcb5 6000 if (setTypeSize(dstset) > 0) {
09241813 6001 dbAdd(c->db,dstkey,dstset);
35cabcb5 6002 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 6003 } else {
6004 decrRefCount(dstset);
d36c4e97 6005 addReply(c,shared.czero);
3ea27d37 6006 }
40d224a9 6007 server.dirty++;
6008 }
35cabcb5 6009 zfree(sets);
40d224a9 6010}
6011
6012static void sunionCommand(redisClient *c) {
f4f56e1d 6013 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 6014}
6015
6016static void sunionstoreCommand(redisClient *c) {
f4f56e1d 6017 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
6018}
6019
6020static void sdiffCommand(redisClient *c) {
6021 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
6022}
6023
6024static void sdiffstoreCommand(redisClient *c) {
6025 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 6026}
6027
6b47e12e 6028/* ==================================== ZSets =============================== */
6029
6030/* ZSETs are ordered sets using two data structures to hold the same elements
6031 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
6032 * data structure.
6033 *
6034 * The elements are added to a hash table mapping Redis objects to scores.
6035 * At the same time the elements are added to a skip list mapping scores
6036 * to Redis objects (so objects are sorted by scores in this "view"). */
6037
6038/* This skiplist implementation is almost a C translation of the original
6039 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
6040 * Alternative to Balanced Trees", modified in three ways:
6041 * a) this implementation allows for repeated values.
6042 * b) the comparison is not just by key (our 'score') but by satellite data.
6043 * c) there is a back pointer, so it's a doubly linked list with the back
6044 * pointers being only at "level 1". This allows traversing the list
6045 * from tail to head, useful for ZREVRANGE. */
6046
6047static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
6048 zskiplistNode *zn = zmalloc(sizeof(*zn));
6049
6050 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 6051 if (level > 1)
2b37892e 6052 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 6053 else
6054 zn->span = NULL;
6b47e12e 6055 zn->score = score;
6056 zn->obj = obj;
6057 return zn;
6058}
6059
6060static zskiplist *zslCreate(void) {
6061 int j;
6062 zskiplist *zsl;
e0a62c7f 6063
6b47e12e 6064 zsl = zmalloc(sizeof(*zsl));
6065 zsl->level = 1;
cc812361 6066 zsl->length = 0;
6b47e12e 6067 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 6068 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 6069 zsl->header->forward[j] = NULL;
94e543b5 6070
6071 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
6072 if (j < ZSKIPLIST_MAXLEVEL-1)
6073 zsl->header->span[j] = 0;
69d95c3e 6074 }
e3870fab 6075 zsl->header->backward = NULL;
6076 zsl->tail = NULL;
6b47e12e 6077 return zsl;
6078}
6079
fd8ccf44 6080static void zslFreeNode(zskiplistNode *node) {
6081 decrRefCount(node->obj);
ad807e6f 6082 zfree(node->forward);
69d95c3e 6083 zfree(node->span);
fd8ccf44 6084 zfree(node);
6085}
6086
6087static void zslFree(zskiplist *zsl) {
ad807e6f 6088 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 6089
ad807e6f 6090 zfree(zsl->header->forward);
69d95c3e 6091 zfree(zsl->header->span);
ad807e6f 6092 zfree(zsl->header);
fd8ccf44 6093 while(node) {
599379dd 6094 next = node->forward[0];
fd8ccf44 6095 zslFreeNode(node);
6096 node = next;
6097 }
ad807e6f 6098 zfree(zsl);
fd8ccf44 6099}
6100
6b47e12e 6101static int zslRandomLevel(void) {
6102 int level = 1;
6103 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
6104 level += 1;
10c2baa5 6105 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 6106}
6107
6108static void zslInsert(zskiplist *zsl, double score, robj *obj) {
6109 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 6110 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 6111 int i, level;
6112
6113 x = zsl->header;
6114 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
6115 /* store rank that is crossed to reach the insert position */
6116 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 6117
9d60e6e4 6118 while (x->forward[i] &&
6119 (x->forward[i]->score < score ||
6120 (x->forward[i]->score == score &&
69d95c3e 6121 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 6122 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 6123 x = x->forward[i];
69d95c3e 6124 }
6b47e12e 6125 update[i] = x;
6126 }
6b47e12e 6127 /* we assume the key is not already inside, since we allow duplicated
6128 * scores, and the re-insertion of score and redis object should never
6129 * happpen since the caller of zslInsert() should test in the hash table
6130 * if the element is already inside or not. */
6131 level = zslRandomLevel();
6132 if (level > zsl->level) {
69d95c3e 6133 for (i = zsl->level; i < level; i++) {
2b37892e 6134 rank[i] = 0;
6b47e12e 6135 update[i] = zsl->header;
2b37892e 6136 update[i]->span[i-1] = zsl->length;
69d95c3e 6137 }
6b47e12e 6138 zsl->level = level;
6139 }
6140 x = zslCreateNode(level,score,obj);
6141 for (i = 0; i < level; i++) {
6142 x->forward[i] = update[i]->forward[i];
6143 update[i]->forward[i] = x;
69d95c3e
PN
6144
6145 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
6146 if (i > 0) {
6147 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6148 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6149 }
6b47e12e 6150 }
69d95c3e
PN
6151
6152 /* increment span for untouched levels */
6153 for (i = level; i < zsl->level; i++) {
2b37892e 6154 update[i]->span[i-1]++;
69d95c3e
PN
6155 }
6156
bb975144 6157 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 6158 if (x->forward[0])
6159 x->forward[0]->backward = x;
6160 else
6161 zsl->tail = x;
cc812361 6162 zsl->length++;
6b47e12e 6163}
6164
84105336
PN
6165/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6166void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6167 int i;
6168 for (i = 0; i < zsl->level; i++) {
6169 if (update[i]->forward[i] == x) {
6170 if (i > 0) {
6171 update[i]->span[i-1] += x->span[i-1] - 1;
6172 }
6173 update[i]->forward[i] = x->forward[i];
6174 } else {
6175 /* invariant: i > 0, because update[0]->forward[0]
6176 * is always equal to x */
6177 update[i]->span[i-1] -= 1;
6178 }
6179 }
6180 if (x->forward[0]) {
6181 x->forward[0]->backward = x->backward;
6182 } else {
6183 zsl->tail = x->backward;
6184 }
6185 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6186 zsl->level--;
6187 zsl->length--;
6188}
6189
50c55df5 6190/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 6191static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 6192 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6193 int i;
6194
6195 x = zsl->header;
6196 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 6197 while (x->forward[i] &&
6198 (x->forward[i]->score < score ||
6199 (x->forward[i]->score == score &&
6200 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 6201 x = x->forward[i];
6202 update[i] = x;
6203 }
6204 /* We may have multiple elements with the same score, what we need
6205 * is to find the element with both the right score and object. */
6206 x = x->forward[0];
bf028098 6207 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 6208 zslDeleteNode(zsl, x, update);
9d60e6e4 6209 zslFreeNode(x);
9d60e6e4 6210 return 1;
6211 } else {
6212 return 0; /* not found */
e197b441 6213 }
6214 return 0; /* not found */
fd8ccf44 6215}
6216
1807985b 6217/* Delete all the elements with score between min and max from the skiplist.
6218 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6219 * Note that this function takes the reference to the hash table view of the
6220 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 6221static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 6222 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6223 unsigned long removed = 0;
6224 int i;
6225
6226 x = zsl->header;
6227 for (i = zsl->level-1; i >= 0; i--) {
6228 while (x->forward[i] && x->forward[i]->score < min)
6229 x = x->forward[i];
6230 update[i] = x;
6231 }
6232 /* We may have multiple elements with the same score, what we need
6233 * is to find the element with both the right score and object. */
6234 x = x->forward[0];
6235 while (x && x->score <= max) {
84105336
PN
6236 zskiplistNode *next = x->forward[0];
6237 zslDeleteNode(zsl, x, update);
1807985b 6238 dictDelete(dict,x->obj);
6239 zslFreeNode(x);
1807985b 6240 removed++;
6241 x = next;
6242 }
6243 return removed; /* not found */
6244}
1807985b 6245
9212eafd 6246/* Delete all the elements with rank between start and end from the skiplist.
2424490f 6247 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
6248static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6249 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6250 unsigned long traversed = 0, removed = 0;
6251 int i;
6252
9212eafd
PN
6253 x = zsl->header;
6254 for (i = zsl->level-1; i >= 0; i--) {
6255 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6256 traversed += i > 0 ? x->span[i-1] : 1;
6257 x = x->forward[i];
1807985b 6258 }
9212eafd
PN
6259 update[i] = x;
6260 }
6261
6262 traversed++;
6263 x = x->forward[0];
6264 while (x && traversed <= end) {
84105336
PN
6265 zskiplistNode *next = x->forward[0];
6266 zslDeleteNode(zsl, x, update);
1807985b 6267 dictDelete(dict,x->obj);
6268 zslFreeNode(x);
1807985b 6269 removed++;
9212eafd 6270 traversed++;
1807985b 6271 x = next;
6272 }
9212eafd 6273 return removed;
1807985b 6274}
6275
50c55df5 6276/* Find the first node having a score equal or greater than the specified one.
6277 * Returns NULL if there is no match. */
6278static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6279 zskiplistNode *x;
6280 int i;
6281
6282 x = zsl->header;
6283 for (i = zsl->level-1; i >= 0; i--) {
6284 while (x->forward[i] && x->forward[i]->score < score)
6285 x = x->forward[i];
6286 }
6287 /* We may have multiple elements with the same score, what we need
6288 * is to find the element with both the right score and object. */
6289 return x->forward[0];
6290}
6291
27b0ccca
PN
6292/* Find the rank for an element by both score and key.
6293 * Returns 0 when the element cannot be found, rank otherwise.
6294 * Note that the rank is 1-based due to the span of zsl->header to the
6295 * first element. */
003f0840 6296static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
27b0ccca
PN
6297 zskiplistNode *x;
6298 unsigned long rank = 0;
6299 int i;
6300
6301 x = zsl->header;
6302 for (i = zsl->level-1; i >= 0; i--) {
6303 while (x->forward[i] &&
6304 (x->forward[i]->score < score ||
6305 (x->forward[i]->score == score &&
6306 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 6307 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
6308 x = x->forward[i];
6309 }
6310
6311 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 6312 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
6313 return rank;
6314 }
6315 }
6316 return 0;
6317}
6318
e74825c2 6319/* Finds an element by its rank. The rank argument needs to be 1-based. */
003f0840 6320zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
e74825c2
PN
6321 zskiplistNode *x;
6322 unsigned long traversed = 0;
6323 int i;
6324
6325 x = zsl->header;
6326 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 6327 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6328 {
a50ea45c 6329 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
6330 x = x->forward[i];
6331 }
e74825c2
PN
6332 if (traversed == rank) {
6333 return x;
6334 }
6335 }
6336 return NULL;
6337}
6338
fd8ccf44 6339/* The actual Z-commands implementations */
6340
7db723ad 6341/* This generic command implements both ZADD and ZINCRBY.
e2665397 6342 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 6343 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 6344static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 6345 robj *zsetobj;
6346 zset *zs;
6347 double *score;
6348
5fc9229c 6349 if (isnan(scoreval)) {
6350 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6351 return;
6352 }
6353
e2665397 6354 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 6355 if (zsetobj == NULL) {
6356 zsetobj = createZsetObject();
09241813 6357 dbAdd(c->db,key,zsetobj);
fd8ccf44 6358 } else {
6359 if (zsetobj->type != REDIS_ZSET) {
6360 addReply(c,shared.wrongtypeerr);
6361 return;
6362 }
6363 }
fd8ccf44 6364 zs = zsetobj->ptr;
e2665397 6365
7db723ad 6366 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 6367 * needs to handle the two different conditions. It's all about setting
6368 * '*score', that is, the new score to set, to the right value. */
6369 score = zmalloc(sizeof(double));
6370 if (doincrement) {
6371 dictEntry *de;
6372
6373 /* Read the old score. If the element was not present starts from 0 */
6374 de = dictFind(zs->dict,ele);
6375 if (de) {
6376 double *oldscore = dictGetEntryVal(de);
6377 *score = *oldscore + scoreval;
6378 } else {
6379 *score = scoreval;
6380 }
5fc9229c 6381 if (isnan(*score)) {
6382 addReplySds(c,
6383 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6384 zfree(score);
6385 /* Note that we don't need to check if the zset may be empty and
6386 * should be removed here, as we can only obtain Nan as score if
6387 * there was already an element in the sorted set. */
6388 return;
6389 }
e2665397 6390 } else {
6391 *score = scoreval;
6392 }
6393
6394 /* What follows is a simple remove and re-insert operation that is common
7db723ad 6395 * to both ZADD and ZINCRBY... */
e2665397 6396 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 6397 /* case 1: New element */
e2665397 6398 incrRefCount(ele); /* added to hash */
6399 zslInsert(zs->zsl,*score,ele);
6400 incrRefCount(ele); /* added to skiplist */
fd8ccf44 6401 server.dirty++;
e2665397 6402 if (doincrement)
e2665397 6403 addReplyDouble(c,*score);
91d71bfc 6404 else
6405 addReply(c,shared.cone);
fd8ccf44 6406 } else {
6407 dictEntry *de;
6408 double *oldscore;
e0a62c7f 6409
fd8ccf44 6410 /* case 2: Score update operation */
e2665397 6411 de = dictFind(zs->dict,ele);
dfc5e96c 6412 redisAssert(de != NULL);
fd8ccf44 6413 oldscore = dictGetEntryVal(de);
6414 if (*score != *oldscore) {
6415 int deleted;
6416
e2665397 6417 /* Remove and insert the element in the skip list with new score */
6418 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 6419 redisAssert(deleted != 0);
e2665397 6420 zslInsert(zs->zsl,*score,ele);
6421 incrRefCount(ele);
6422 /* Update the score in the hash table */
6423 dictReplace(zs->dict,ele,score);
fd8ccf44 6424 server.dirty++;
2161a965 6425 } else {
6426 zfree(score);
fd8ccf44 6427 }
e2665397 6428 if (doincrement)
6429 addReplyDouble(c,*score);
6430 else
6431 addReply(c,shared.czero);
fd8ccf44 6432 }
6433}
6434
e2665397 6435static void zaddCommand(redisClient *c) {
6436 double scoreval;
6437
bd79a6bd 6438 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6439 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6440}
6441
7db723ad 6442static void zincrbyCommand(redisClient *c) {
e2665397 6443 double scoreval;
6444
bd79a6bd 6445 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6446 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6447}
6448
1b7106e7 6449static void zremCommand(redisClient *c) {
6450 robj *zsetobj;
6451 zset *zs;
dd88747b 6452 dictEntry *de;
6453 double *oldscore;
6454 int deleted;
1b7106e7 6455
dd88747b 6456 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6457 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 6458
dd88747b 6459 zs = zsetobj->ptr;
6460 de = dictFind(zs->dict,c->argv[2]);
6461 if (de == NULL) {
6462 addReply(c,shared.czero);
6463 return;
1b7106e7 6464 }
dd88747b 6465 /* Delete from the skiplist */
6466 oldscore = dictGetEntryVal(de);
6467 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6468 redisAssert(deleted != 0);
6469
6470 /* Delete from the hash table */
6471 dictDelete(zs->dict,c->argv[2]);
6472 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6473 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6474 server.dirty++;
6475 addReply(c,shared.cone);
1b7106e7 6476}
6477
1807985b 6478static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
6479 double min;
6480 double max;
dd88747b 6481 long deleted;
1807985b 6482 robj *zsetobj;
6483 zset *zs;
6484
bd79a6bd
PN
6485 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6486 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 6487
dd88747b 6488 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6489 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 6490
dd88747b 6491 zs = zsetobj->ptr;
6492 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6493 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6494 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6495 server.dirty += deleted;
482b672d 6496 addReplyLongLong(c,deleted);
1807985b 6497}
6498
9212eafd 6499static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
6500 long start;
6501 long end;
dd88747b 6502 int llen;
6503 long deleted;
9212eafd
PN
6504 robj *zsetobj;
6505 zset *zs;
6506
bd79a6bd
PN
6507 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6508 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6509
dd88747b 6510 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6511 checkType(c,zsetobj,REDIS_ZSET)) return;
6512 zs = zsetobj->ptr;
6513 llen = zs->zsl->length;
9212eafd 6514
dd88747b 6515 /* convert negative indexes */
6516 if (start < 0) start = llen+start;
6517 if (end < 0) end = llen+end;
6518 if (start < 0) start = 0;
6519 if (end < 0) end = 0;
9212eafd 6520
dd88747b 6521 /* indexes sanity checks */
6522 if (start > end || start >= llen) {
6523 addReply(c,shared.czero);
6524 return;
9212eafd 6525 }
dd88747b 6526 if (end >= llen) end = llen-1;
6527
6528 /* increment start and end because zsl*Rank functions
6529 * use 1-based rank */
6530 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6531 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6532 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6533 server.dirty += deleted;
482b672d 6534 addReplyLongLong(c, deleted);
9212eafd
PN
6535}
6536
8f92e768
PN
6537typedef struct {
6538 dict *dict;
6539 double weight;
6540} zsetopsrc;
6541
6542static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6543 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6544 unsigned long size1, size2;
6545 size1 = d1->dict ? dictSize(d1->dict) : 0;
6546 size2 = d2->dict ? dictSize(d2->dict) : 0;
6547 return size1 - size2;
6548}
6549
d2764cd6
PN
/* Ways of combining the scores of an element found in several inputs. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry; an entry without a value counts as 1.0. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to the 'aggregate' mode: add it, or
 * keep the smaller / larger of the two. Panics on an unknown mode. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6567
2830ca53 6568static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6569 int i, j, setnum;
d2764cd6 6570 int aggregate = REDIS_AGGR_SUM;
8f92e768 6571 zsetopsrc *src;
2830ca53
PN
6572 robj *dstobj;
6573 zset *dstzset;
b287c9bb
PN
6574 dictIterator *di;
6575 dictEntry *de;
6576
bc000c1d
JC
6577 /* expect setnum input keys to be given */
6578 setnum = atoi(c->argv[2]->ptr);
6579 if (setnum < 1) {
5d373da9 6580 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6581 return;
b287c9bb 6582 }
2830ca53
PN
6583
6584 /* test if the expected number of keys would overflow */
bc000c1d 6585 if (3+setnum > c->argc) {
b287c9bb
PN
6586 addReply(c,shared.syntaxerr);
6587 return;
6588 }
6589
2830ca53 6590 /* read keys to be used for input */
bc000c1d
JC
6591 src = zmalloc(sizeof(zsetopsrc) * setnum);
6592 for (i = 0, j = 3; i < setnum; i++, j++) {
6593 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6594 if (!obj) {
8f92e768 6595 src[i].dict = NULL;
b287c9bb 6596 } else {
bc000c1d
JC
6597 if (obj->type == REDIS_ZSET) {
6598 src[i].dict = ((zset*)obj->ptr)->dict;
6599 } else if (obj->type == REDIS_SET) {
6600 src[i].dict = (obj->ptr);
6601 } else {
8f92e768 6602 zfree(src);
b287c9bb
PN
6603 addReply(c,shared.wrongtypeerr);
6604 return;
6605 }
b287c9bb 6606 }
2830ca53
PN
6607
6608 /* default all weights to 1 */
8f92e768 6609 src[i].weight = 1.0;
b287c9bb
PN
6610 }
6611
2830ca53
PN
6612 /* parse optional extra arguments */
6613 if (j < c->argc) {
d2764cd6 6614 int remaining = c->argc - j;
b287c9bb 6615
2830ca53 6616 while (remaining) {
bc000c1d 6617 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6618 j++; remaining--;
bc000c1d 6619 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6620 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6621 return;
2830ca53 6622 }
d2764cd6
PN
6623 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6624 j++; remaining--;
6625 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6626 aggregate = REDIS_AGGR_SUM;
6627 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6628 aggregate = REDIS_AGGR_MIN;
6629 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6630 aggregate = REDIS_AGGR_MAX;
6631 } else {
6632 zfree(src);
6633 addReply(c,shared.syntaxerr);
6634 return;
6635 }
6636 j++; remaining--;
2830ca53 6637 } else {
8f92e768 6638 zfree(src);
2830ca53
PN
6639 addReply(c,shared.syntaxerr);
6640 return;
6641 }
6642 }
6643 }
b287c9bb 6644
d2764cd6
PN
6645 /* sort sets from the smallest to largest, this will improve our
6646 * algorithm's performance */
bc000c1d 6647 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6648
2830ca53
PN
6649 dstobj = createZsetObject();
6650 dstzset = dstobj->ptr;
6651
6652 if (op == REDIS_OP_INTER) {
8f92e768
PN
6653 /* skip going over all entries if the smallest zset is NULL or empty */
6654 if (src[0].dict && dictSize(src[0].dict) > 0) {
6655 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6656 * from small to large, all src[i > 0].dict are non-empty too */
6657 di = dictGetIterator(src[0].dict);
2830ca53 6658 while((de = dictNext(di)) != NULL) {
d2764cd6 6659 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6660 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6661
bc000c1d 6662 for (j = 1; j < setnum; j++) {
d2764cd6 6663 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6664 if (other) {
bc000c1d 6665 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6666 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6667 } else {
6668 break;
6669 }
6670 }
b287c9bb 6671
2830ca53 6672 /* skip entry when not present in every source dict */
bc000c1d 6673 if (j != setnum) {
2830ca53
PN
6674 zfree(score);
6675 } else {
6676 robj *o = dictGetEntryKey(de);
6677 dictAdd(dstzset->dict,o,score);
6678 incrRefCount(o); /* added to dictionary */
6679 zslInsert(dstzset->zsl,*score,o);
6680 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6681 }
6682 }
2830ca53
PN
6683 dictReleaseIterator(di);
6684 }
6685 } else if (op == REDIS_OP_UNION) {
bc000c1d 6686 for (i = 0; i < setnum; i++) {
8f92e768 6687 if (!src[i].dict) continue;
2830ca53 6688
8f92e768 6689 di = dictGetIterator(src[i].dict);
2830ca53
PN
6690 while((de = dictNext(di)) != NULL) {
6691 /* skip key when already processed */
6692 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6693
d2764cd6 6694 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6695 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6696
d2764cd6
PN
6697 /* because the zsets are sorted by size, its only possible
6698 * for sets at larger indices to hold this entry */
bc000c1d 6699 for (j = (i+1); j < setnum; j++) {
d2764cd6 6700 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6701 if (other) {
bc000c1d 6702 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6703 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6704 }
6705 }
b287c9bb 6706
2830ca53
PN
6707 robj *o = dictGetEntryKey(de);
6708 dictAdd(dstzset->dict,o,score);
6709 incrRefCount(o); /* added to dictionary */
6710 zslInsert(dstzset->zsl,*score,o);
6711 incrRefCount(o); /* added to skiplist */
6712 }
6713 dictReleaseIterator(di);
b287c9bb 6714 }
2830ca53
PN
6715 } else {
6716 /* unknown operator */
6717 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6718 }
6719
09241813 6720 dbDelete(c->db,dstkey);
3ea27d37 6721 if (dstzset->zsl->length) {
09241813 6722 dbAdd(c->db,dstkey,dstobj);
482b672d 6723 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6724 server.dirty++;
6725 } else {
8bca8773 6726 decrRefCount(dstobj);
3ea27d37 6727 addReply(c, shared.czero);
6728 }
8f92e768 6729 zfree(src);
b287c9bb
PN
6730}
6731
5d373da9 6732static void zunionstoreCommand(redisClient *c) {
2830ca53 6733 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6734}
6735
5d373da9 6736static void zinterstoreCommand(redisClient *c) {
2830ca53 6737 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6738}
6739
e3870fab 6740static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6741 robj *o;
bbe025e0
AM
6742 long start;
6743 long end;
752da584 6744 int withscores = 0;
dd88747b 6745 int llen;
6746 int rangelen, j;
6747 zset *zsetobj;
6748 zskiplist *zsl;
6749 zskiplistNode *ln;
6750 robj *ele;
752da584 6751
bd79a6bd
PN
6752 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6753 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6754
752da584 6755 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6756 withscores = 1;
6757 } else if (c->argc >= 5) {
6758 addReply(c,shared.syntaxerr);
6759 return;
6760 }
cc812361 6761
4e27f268 6762 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6763 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6764 zsetobj = o->ptr;
6765 zsl = zsetobj->zsl;
6766 llen = zsl->length;
cc812361 6767
dd88747b 6768 /* convert negative indexes */
6769 if (start < 0) start = llen+start;
6770 if (end < 0) end = llen+end;
6771 if (start < 0) start = 0;
6772 if (end < 0) end = 0;
cc812361 6773
dd88747b 6774 /* indexes sanity checks */
6775 if (start > end || start >= llen) {
6776 /* Out of range start or start > end result in empty list */
6777 addReply(c,shared.emptymultibulk);
6778 return;
6779 }
6780 if (end >= llen) end = llen-1;
6781 rangelen = (end-start)+1;
cc812361 6782
dd88747b 6783 /* check if starting point is trivial, before searching
6784 * the element in log(N) time */
6785 if (reverse) {
003f0840 6786 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
dd88747b 6787 } else {
6788 ln = start == 0 ?
003f0840 6789 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
dd88747b 6790 }
cc812361 6791
dd88747b 6792 /* Return the result in form of a multi-bulk reply */
6793 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6794 withscores ? (rangelen*2) : rangelen));
6795 for (j = 0; j < rangelen; j++) {
6796 ele = ln->obj;
6797 addReplyBulk(c,ele);
6798 if (withscores)
6799 addReplyDouble(c,ln->score);
6800 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6801 }
6802}
6803
e3870fab 6804static void zrangeCommand(redisClient *c) {
6805 zrangeGenericCommand(c,0);
6806}
6807
6808static void zrevrangeCommand(redisClient *c) {
6809 zrangeGenericCommand(c,1);
6810}
6811
f44dd428 6812/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6813 * If justcount is non-zero, just the count is returned. */
6814static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6815 robj *o;
f44dd428 6816 double min, max;
6817 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6818 int offset = 0, limit = -1;
0500ef27
SH
6819 int withscores = 0;
6820 int badsyntax = 0;
6821
f44dd428 6822 /* Parse the min-max interval. If one of the values is prefixed
6823 * by the "(" character, it's considered "open". For instance
6824 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6825 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6826 if (((char*)c->argv[2]->ptr)[0] == '(') {
6827 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6828 minex = 1;
6829 } else {
6830 min = strtod(c->argv[2]->ptr,NULL);
6831 }
6832 if (((char*)c->argv[3]->ptr)[0] == '(') {
6833 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6834 maxex = 1;
6835 } else {
6836 max = strtod(c->argv[3]->ptr,NULL);
6837 }
6838
6839 /* Parse "WITHSCORES": note that if the command was called with
6840 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6841 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6842 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6843 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6844 withscores = 1;
6845 else
6846 badsyntax = 1;
0500ef27 6847 }
3a3978b1 6848 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6849 badsyntax = 1;
0500ef27 6850 if (badsyntax) {
454d4e43 6851 addReplySds(c,
6852 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6853 return;
0500ef27
SH
6854 }
6855
f44dd428 6856 /* Parse "LIMIT" */
0500ef27 6857 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6858 addReply(c,shared.syntaxerr);
6859 return;
0500ef27 6860 } else if (c->argc == (7 + withscores)) {
80181f78 6861 offset = atoi(c->argv[5]->ptr);
6862 limit = atoi(c->argv[6]->ptr);
0b13687c 6863 if (offset < 0) offset = 0;
80181f78 6864 }
50c55df5 6865
f44dd428 6866 /* Ok, lookup the key and get the range */
50c55df5 6867 o = lookupKeyRead(c->db,c->argv[1]);
6868 if (o == NULL) {
4e27f268 6869 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6870 } else {
6871 if (o->type != REDIS_ZSET) {
6872 addReply(c,shared.wrongtypeerr);
6873 } else {
6874 zset *zsetobj = o->ptr;
6875 zskiplist *zsl = zsetobj->zsl;
6876 zskiplistNode *ln;
f44dd428 6877 robj *ele, *lenobj = NULL;
6878 unsigned long rangelen = 0;
50c55df5 6879
f44dd428 6880 /* Get the first node with the score >= min, or with
6881 * score > min if 'minex' is true. */
50c55df5 6882 ln = zslFirstWithScore(zsl,min);
f44dd428 6883 while (minex && ln && ln->score == min) ln = ln->forward[0];
6884
50c55df5 6885 if (ln == NULL) {
6886 /* No element matching the speciifed interval */
f44dd428 6887 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6888 return;
6889 }
6890
6891 /* We don't know in advance how many matching elements there
6892 * are in the list, so we push this object that will represent
6893 * the multi-bulk length in the output buffer, and will "fix"
6894 * it later */
f44dd428 6895 if (!justcount) {
6896 lenobj = createObject(REDIS_STRING,NULL);
6897 addReply(c,lenobj);
6898 decrRefCount(lenobj);
6899 }
50c55df5 6900
f44dd428 6901 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6902 if (offset) {
6903 offset--;
6904 ln = ln->forward[0];
6905 continue;
6906 }
6907 if (limit == 0) break;
f44dd428 6908 if (!justcount) {
6909 ele = ln->obj;
dd88747b 6910 addReplyBulk(c,ele);
f44dd428 6911 if (withscores)
6912 addReplyDouble(c,ln->score);
6913 }
50c55df5 6914 ln = ln->forward[0];
6915 rangelen++;
80181f78 6916 if (limit > 0) limit--;
50c55df5 6917 }
f44dd428 6918 if (justcount) {
482b672d 6919 addReplyLongLong(c,(long)rangelen);
f44dd428 6920 } else {
6921 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6922 withscores ? (rangelen*2) : rangelen);
6923 }
50c55df5 6924 }
6925 }
6926}
6927
f44dd428 6928static void zrangebyscoreCommand(redisClient *c) {
6929 genericZrangebyscoreCommand(c,0);
6930}
6931
6932static void zcountCommand(redisClient *c) {
6933 genericZrangebyscoreCommand(c,1);
6934}
6935
3c41331e 6936static void zcardCommand(redisClient *c) {
e197b441 6937 robj *o;
6938 zset *zs;
dd88747b 6939
6940 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6941 checkType(c,o,REDIS_ZSET)) return;
6942
6943 zs = o->ptr;
6944 addReplyUlong(c,zs->zsl->length);
e197b441 6945}
6946
6e333bbe 6947static void zscoreCommand(redisClient *c) {
6948 robj *o;
6949 zset *zs;
dd88747b 6950 dictEntry *de;
6951
6952 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6953 checkType(c,o,REDIS_ZSET)) return;
6954
6955 zs = o->ptr;
6956 de = dictFind(zs->dict,c->argv[2]);
6957 if (!de) {
96d8b4ee 6958 addReply(c,shared.nullbulk);
6e333bbe 6959 } else {
dd88747b 6960 double *score = dictGetEntryVal(de);
6e333bbe 6961
dd88747b 6962 addReplyDouble(c,*score);
6e333bbe 6963 }
6964}
6965
798d9e55 6966static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6967 robj *o;
dd88747b 6968 zset *zs;
6969 zskiplist *zsl;
6970 dictEntry *de;
6971 unsigned long rank;
6972 double *score;
6973
6974 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6975 checkType(c,o,REDIS_ZSET)) return;
6976
6977 zs = o->ptr;
6978 zsl = zs->zsl;
6979 de = dictFind(zs->dict,c->argv[2]);
6980 if (!de) {
69d95c3e
PN
6981 addReply(c,shared.nullbulk);
6982 return;
6983 }
69d95c3e 6984
dd88747b 6985 score = dictGetEntryVal(de);
003f0840 6986 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
dd88747b 6987 if (rank) {
6988 if (reverse) {
482b672d 6989 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6990 } else {
482b672d 6991 addReplyLongLong(c, rank-1);
69d95c3e 6992 }
dd88747b 6993 } else {
6994 addReply(c,shared.nullbulk);
978c2c94 6995 }
6996}
6997
798d9e55
PN
6998static void zrankCommand(redisClient *c) {
6999 zrankGenericCommand(c, 0);
7000}
7001
7002static void zrevrankCommand(redisClient *c) {
7003 zrankGenericCommand(c, 1);
7004}
7005
7fb16bac
PN
/* ========================= Hashes utility functions ======================= */

/* Selectors for the part of a hash entry to operate on. They are distinct
 * bits so that they can be OR-ed together and tested with '&'. */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
978c2c94 7009
7fb16bac
PN
7010/* Check the length of a number of objects to see if we need to convert a
7011 * zipmap to a real hash. Note that we only check string encoded objects
7012 * as their string length can be queried in constant time. */
d1578a33 7013static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
7fb16bac
PN
7014 int i;
7015 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 7016
7fb16bac
PN
7017 for (i = start; i <= end; i++) {
7018 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
7019 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
7020 {
7021 convertToRealHash(subject);
978c2c94 7022 return;
7023 }
7024 }
7fb16bac 7025}
bae2c7ec 7026
97224de7 7027/* Encode given objects in-place when the hash uses a dict. */
d1578a33 7028static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
97224de7 7029 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
7030 if (o1) *o1 = tryObjectEncoding(*o1);
7031 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
7032 }
7033}
7034
7fb16bac 7035/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
7036 * object or NULL if the value cannot be found. The refcount of the object
7037 * is always increased by 1 when the value was found. */
d1578a33 7038static robj *hashTypeGet(robj *o, robj *key) {
7fb16bac 7039 robj *value = NULL;
978c2c94 7040 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
7041 unsigned char *v;
7042 unsigned int vlen;
7043 key = getDecodedObject(key);
7044 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
7045 value = createStringObject((char*)v,vlen);
7046 }
7047 decrRefCount(key);
7048 } else {
7049 dictEntry *de = dictFind(o->ptr,key);
7050 if (de != NULL) {
7051 value = dictGetEntryVal(de);
a3f3af86 7052 incrRefCount(value);
7fb16bac
PN
7053 }
7054 }
7055 return value;
7056}
978c2c94 7057
7fb16bac
PN
7058/* Test if the key exists in the given hash. Returns 1 if the key
7059 * exists and 0 when it doesn't. */
d1578a33 7060static int hashTypeExists(robj *o, robj *key) {
7fb16bac
PN
7061 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7062 key = getDecodedObject(key);
7063 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
7064 decrRefCount(key);
7065 return 1;
7066 }
7067 decrRefCount(key);
7068 } else {
7069 if (dictFind(o->ptr,key) != NULL) {
7070 return 1;
7071 }
7072 }
7073 return 0;
7074}
bae2c7ec 7075
7fb16bac
PN
7076/* Add an element, discard the old if the key already exists.
7077 * Return 0 on insert and 1 on update. */
d1578a33 7078static int hashTypeSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
7079 int update = 0;
7080 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7081 key = getDecodedObject(key);
7082 value = getDecodedObject(value);
7083 o->ptr = zipmapSet(o->ptr,
7084 key->ptr,sdslen(key->ptr),
7085 value->ptr,sdslen(value->ptr), &update);
7086 decrRefCount(key);
7087 decrRefCount(value);
7088
7089 /* Check if the zipmap needs to be upgraded to a real hash table */
7090 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 7091 convertToRealHash(o);
978c2c94 7092 } else {
7fb16bac
PN
7093 if (dictReplace(o->ptr,key,value)) {
7094 /* Insert */
7095 incrRefCount(key);
978c2c94 7096 } else {
7fb16bac 7097 /* Update */
978c2c94 7098 update = 1;
7099 }
7fb16bac 7100 incrRefCount(value);
978c2c94 7101 }
7fb16bac 7102 return update;
978c2c94 7103}
7104
7fb16bac
PN
7105/* Delete an element from a hash.
7106 * Return 1 on deleted and 0 on not found. */
d1578a33 7107static int hashTypeDelete(robj *o, robj *key) {
7fb16bac
PN
7108 int deleted = 0;
7109 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7110 key = getDecodedObject(key);
7111 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
7112 decrRefCount(key);
7113 } else {
7114 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
7115 /* Always check if the dictionary needs a resize after a delete. */
7116 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 7117 }
7fb16bac
PN
7118 return deleted;
7119}
d33278d1 7120
7fb16bac 7121/* Return the number of elements in a hash. */
d1578a33 7122static unsigned long hashTypeLength(robj *o) {
7fb16bac
PN
7123 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7124 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7125}
7126
7127/* Structure to hold hash iteration abstration. Note that iteration over
7128 * hashes involves both fields and values. Because it is possible that
7129 * not both are required, store pointers in the iterator to avoid
7130 * unnecessary memory allocation for fields/values. */
7131typedef struct {
7132 int encoding;
7133 unsigned char *zi;
7134 unsigned char *zk, *zv;
7135 unsigned int zklen, zvlen;
7136
7137 dictIterator *di;
7138 dictEntry *de;
d1578a33 7139} hashTypeIterator;
7fb16bac 7140
d1578a33
PN
7141static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7142 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7fb16bac
PN
7143 hi->encoding = subject->encoding;
7144 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7145 hi->zi = zipmapRewind(subject->ptr);
7146 } else if (hi->encoding == REDIS_ENCODING_HT) {
7147 hi->di = dictGetIterator(subject->ptr);
d33278d1 7148 } else {
7fb16bac 7149 redisAssert(NULL);
d33278d1 7150 }
c44d3b56 7151 return hi;
7fb16bac 7152}
d33278d1 7153
d1578a33 7154static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7fb16bac
PN
7155 if (hi->encoding == REDIS_ENCODING_HT) {
7156 dictReleaseIterator(hi->di);
d33278d1 7157 }
c44d3b56 7158 zfree(hi);
7fb16bac 7159}
d33278d1 7160
7fb16bac
PN
7161/* Move to the next entry in the hash. Return REDIS_OK when the next entry
7162 * could be found and REDIS_ERR when the iterator reaches the end. */
d1578a33 7163static int hashTypeNext(hashTypeIterator *hi) {
7fb16bac
PN
7164 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7165 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7166 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7167 } else {
7168 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7169 }
7170 return REDIS_OK;
7171}
d33278d1 7172
0c390abc 7173/* Get key or value object at current iteration position.
a3f3af86 7174 * This increases the refcount of the field object by 1. */
d1578a33 7175static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7fb16bac
PN
7176 robj *o;
7177 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7178 if (what & REDIS_HASH_KEY) {
7179 o = createStringObject((char*)hi->zk,hi->zklen);
7180 } else {
7181 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 7182 }
d33278d1 7183 } else {
7fb16bac
PN
7184 if (what & REDIS_HASH_KEY) {
7185 o = dictGetEntryKey(hi->de);
7186 } else {
7187 o = dictGetEntryVal(hi->de);
d33278d1 7188 }
a3f3af86 7189 incrRefCount(o);
d33278d1 7190 }
7fb16bac 7191 return o;
d33278d1
PN
7192}
7193
d1578a33 7194static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7fb16bac 7195 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
7196 if (o == NULL) {
7197 o = createHashObject();
09241813 7198 dbAdd(c->db,key,o);
01426b05
PN
7199 } else {
7200 if (o->type != REDIS_HASH) {
7201 addReply(c,shared.wrongtypeerr);
7fb16bac 7202 return NULL;
01426b05
PN
7203 }
7204 }
7fb16bac
PN
7205 return o;
7206}
01426b05 7207
7fb16bac
PN
7208/* ============================= Hash commands ============================== */
7209static void hsetCommand(redisClient *c) {
6e9e463f 7210 int update;
7fb16bac 7211 robj *o;
bbe025e0 7212
d1578a33
PN
7213 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7214 hashTypeTryConversion(o,c->argv,2,3);
7215 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7216 update = hashTypeSet(o,c->argv[2],c->argv[3]);
6e9e463f 7217 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
7218 server.dirty++;
7219}
01426b05 7220
1f1c7695
PN
7221static void hsetnxCommand(redisClient *c) {
7222 robj *o;
d1578a33
PN
7223 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7224 hashTypeTryConversion(o,c->argv,2,3);
1f1c7695 7225
d1578a33 7226 if (hashTypeExists(o, c->argv[2])) {
1f1c7695 7227 addReply(c, shared.czero);
01426b05 7228 } else {
d1578a33
PN
7229 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7230 hashTypeSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
7231 addReply(c, shared.cone);
7232 server.dirty++;
7233 }
7234}
01426b05 7235
7fb16bac
PN
7236static void hmsetCommand(redisClient *c) {
7237 int i;
7238 robj *o;
01426b05 7239
7fb16bac
PN
7240 if ((c->argc % 2) == 1) {
7241 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7242 return;
7243 }
01426b05 7244
d1578a33
PN
7245 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7246 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7fb16bac 7247 for (i = 2; i < c->argc; i += 2) {
d1578a33
PN
7248 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7249 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
7250 }
7251 addReply(c, shared.ok);
edc2f63a 7252 server.dirty++;
7fb16bac
PN
7253}
7254
7255static void hincrbyCommand(redisClient *c) {
7256 long long value, incr;
7257 robj *o, *current, *new;
7258
bd79a6bd 7259 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
d1578a33
PN
7260 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7261 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
946342c1
PN
7262 if (getLongLongFromObjectOrReply(c,current,&value,
7263 "hash value is not an integer") != REDIS_OK) {
7264 decrRefCount(current);
7265 return;
7266 }
a3f3af86 7267 decrRefCount(current);
7fb16bac
PN
7268 } else {
7269 value = 0;
01426b05
PN
7270 }
7271
7fb16bac 7272 value += incr;
3f973463 7273 new = createStringObjectFromLongLong(value);
d1578a33
PN
7274 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7275 hashTypeSet(o,c->argv[2],new);
7fb16bac
PN
7276 decrRefCount(new);
7277 addReplyLongLong(c,value);
01426b05 7278 server.dirty++;
01426b05
PN
7279}
7280
978c2c94 7281static void hgetCommand(redisClient *c) {
7fb16bac 7282 robj *o, *value;
dd88747b 7283 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7284 checkType(c,o,REDIS_HASH)) return;
7285
d1578a33 7286 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7fb16bac 7287 addReplyBulk(c,value);
a3f3af86 7288 decrRefCount(value);
dd88747b 7289 } else {
7fb16bac 7290 addReply(c,shared.nullbulk);
69d95c3e 7291 }
69d95c3e
PN
7292}
7293
09aeb579
PN
7294static void hmgetCommand(redisClient *c) {
7295 int i;
7fb16bac
PN
7296 robj *o, *value;
7297 o = lookupKeyRead(c->db,c->argv[1]);
7298 if (o != NULL && o->type != REDIS_HASH) {
7299 addReply(c,shared.wrongtypeerr);
09aeb579
PN
7300 }
7301
7fb16bac
PN
7302 /* Note the check for o != NULL happens inside the loop. This is
7303 * done because objects that cannot be found are considered to be
7304 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 7305 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac 7306 for (i = 2; i < c->argc; i++) {
d1578a33 7307 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7fb16bac 7308 addReplyBulk(c,value);
a3f3af86 7309 decrRefCount(value);
7fb16bac
PN
7310 } else {
7311 addReply(c,shared.nullbulk);
09aeb579
PN
7312 }
7313 }
7314}
7315
07efaf74 7316static void hdelCommand(redisClient *c) {
dd88747b 7317 robj *o;
dd88747b 7318 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7319 checkType(c,o,REDIS_HASH)) return;
07efaf74 7320
d1578a33
PN
7321 if (hashTypeDelete(o,c->argv[2])) {
7322 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7fb16bac
PN
7323 addReply(c,shared.cone);
7324 server.dirty++;
dd88747b 7325 } else {
7fb16bac 7326 addReply(c,shared.czero);
07efaf74 7327 }
7328}
7329
92b27fe9 7330static void hlenCommand(redisClient *c) {
7331 robj *o;
dd88747b 7332 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 7333 checkType(c,o,REDIS_HASH)) return;
7334
d1578a33 7335 addReplyUlong(c,hashTypeLength(o));
92b27fe9 7336}
7337
78409a0f 7338static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 7339 robj *o, *lenobj, *obj;
78409a0f 7340 unsigned long count = 0;
d1578a33 7341 hashTypeIterator *hi;
78409a0f 7342
4e27f268 7343 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 7344 || checkType(c,o,REDIS_HASH)) return;
7345
7346 lenobj = createObject(REDIS_STRING,NULL);
7347 addReply(c,lenobj);
7348 decrRefCount(lenobj);
7349
d1578a33
PN
7350 hi = hashTypeInitIterator(o);
7351 while (hashTypeNext(hi) != REDIS_ERR) {
7fb16bac 7352 if (flags & REDIS_HASH_KEY) {
d1578a33 7353 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7fb16bac 7354 addReplyBulk(c,obj);
a3f3af86 7355 decrRefCount(obj);
7fb16bac 7356 count++;
78409a0f 7357 }
7fb16bac 7358 if (flags & REDIS_HASH_VALUE) {
d1578a33 7359 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 7360 addReplyBulk(c,obj);
a3f3af86 7361 decrRefCount(obj);
7fb16bac 7362 count++;
78409a0f 7363 }
78409a0f 7364 }
d1578a33 7365 hashTypeReleaseIterator(hi);
7fb16bac 7366
78409a0f 7367 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7368}
7369
7370static void hkeysCommand(redisClient *c) {
7fb16bac 7371 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 7372}
7373
7374static void hvalsCommand(redisClient *c) {
7fb16bac 7375 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 7376}
7377
7378static void hgetallCommand(redisClient *c) {
7fb16bac 7379 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 7380}
7381
a86f14b1 7382static void hexistsCommand(redisClient *c) {
7383 robj *o;
a86f14b1 7384 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7385 checkType(c,o,REDIS_HASH)) return;
7386
d1578a33 7387 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 7388}
7389
ada386b2 7390static void convertToRealHash(robj *o) {
7391 unsigned char *key, *val, *p, *zm = o->ptr;
7392 unsigned int klen, vlen;
7393 dict *dict = dictCreate(&hashDictType,NULL);
7394
7395 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7396 p = zipmapRewind(zm);
7397 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7398 robj *keyobj, *valobj;
7399
7400 keyobj = createStringObject((char*)key,klen);
7401 valobj = createStringObject((char*)val,vlen);
05df7621 7402 keyobj = tryObjectEncoding(keyobj);
7403 valobj = tryObjectEncoding(valobj);
ada386b2 7404 dictAdd(dict,keyobj,valobj);
7405 }
7406 o->encoding = REDIS_ENCODING_HT;
7407 o->ptr = dict;
7408 zfree(zm);
7409}
7410
6b47e12e 7411/* ========================= Non type-specific commands ==================== */
7412
ed9b544e 7413static void flushdbCommand(redisClient *c) {
ca37e9cd 7414 server.dirty += dictSize(c->db->dict);
9b30e1a2 7415 touchWatchedKeysOnFlush(c->db->id);
3305306f 7416 dictEmpty(c->db->dict);
7417 dictEmpty(c->db->expires);
ed9b544e 7418 addReply(c,shared.ok);
ed9b544e 7419}
7420
7421static void flushallCommand(redisClient *c) {
9b30e1a2 7422 touchWatchedKeysOnFlush(-1);
ca37e9cd 7423 server.dirty += emptyDb();
ed9b544e 7424 addReply(c,shared.ok);
500ece7c 7425 if (server.bgsavechildpid != -1) {
7426 kill(server.bgsavechildpid,SIGKILL);
7427 rdbRemoveTempFile(server.bgsavechildpid);
7428 }
f78fd11b 7429 rdbSave(server.dbfilename);
ca37e9cd 7430 server.dirty++;
ed9b544e 7431}
7432
56906eef 7433static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 7434 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 7435 so->type = type;
7436 so->pattern = pattern;
7437 return so;
7438}
7439
7440/* Return the value associated to the key with a name obtained
55017f9d
PN
7441 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7442 * The returned object will always have its refcount increased by 1
7443 * when it is non-NULL. */
56906eef 7444static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 7445 char *p, *f;
ed9b544e 7446 sds spat, ssub;
6d7d1370
PN
7447 robj keyobj, fieldobj, *o;
7448 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 7449 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7450 struct {
f1017b3f 7451 long len;
7452 long free;
ed9b544e 7453 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 7454 } keyname, fieldname;
ed9b544e 7455
28173a49 7456 /* If the pattern is "#" return the substitution object itself in order
7457 * to implement the "SORT ... GET #" feature. */
7458 spat = pattern->ptr;
7459 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 7460 incrRefCount(subst);
28173a49 7461 return subst;
7462 }
7463
7464 /* The substitution object may be specially encoded. If so we create
9d65a1bb 7465 * a decoded object on the fly. Otherwise getDecodedObject will just
7466 * increment the ref count, that we'll decrement later. */
7467 subst = getDecodedObject(subst);
942a3961 7468
ed9b544e 7469 ssub = subst->ptr;
7470 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7471 p = strchr(spat,'*');
ed5a857a 7472 if (!p) {
7473 decrRefCount(subst);
7474 return NULL;
7475 }
ed9b544e 7476
6d7d1370
PN
7477 /* Find out if we're dealing with a hash dereference. */
7478 if ((f = strstr(p+1, "->")) != NULL) {
7479 fieldlen = sdslen(spat)-(f-spat);
7480 /* this also copies \0 character */
7481 memcpy(fieldname.buf,f+2,fieldlen-1);
7482 fieldname.len = fieldlen-2;
7483 } else {
7484 fieldlen = 0;
7485 }
7486
ed9b544e 7487 prefixlen = p-spat;
7488 sublen = sdslen(ssub);
6d7d1370 7489 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 7490 memcpy(keyname.buf,spat,prefixlen);
7491 memcpy(keyname.buf+prefixlen,ssub,sublen);
7492 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7493 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7494 keyname.len = prefixlen+sublen+postfixlen;
942a3961 7495 decrRefCount(subst);
7496
6d7d1370
PN
7497 /* Lookup substituted key */
7498 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7499 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
7500 if (o == NULL) return NULL;
7501
7502 if (fieldlen > 0) {
7503 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 7504
705dad38
PN
7505 /* Retrieve value from hash by the field name. This operation
7506 * already increases the refcount of the returned object. */
6d7d1370 7507 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
d1578a33 7508 o = hashTypeGet(o, &fieldobj);
705dad38 7509 } else {
55017f9d 7510 if (o->type != REDIS_STRING) return NULL;
b6f07345 7511
705dad38
PN
7512 /* Every object that this function returns needs to have its refcount
7513 * increased. sortCommand decreases it again. */
7514 incrRefCount(o);
6d7d1370
PN
7515 }
7516
7517 return o;
ed9b544e 7518}
7519
7520/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7521 * the additional parameter is not standard but a BSD-specific we have to
7522 * pass sorting parameters via the global 'server' structure */
7523static int sortCompare(const void *s1, const void *s2) {
7524 const redisSortObject *so1 = s1, *so2 = s2;
7525 int cmp;
7526
7527 if (!server.sort_alpha) {
7528 /* Numeric sorting. Here it's trivial as we precomputed scores */
7529 if (so1->u.score > so2->u.score) {
7530 cmp = 1;
7531 } else if (so1->u.score < so2->u.score) {
7532 cmp = -1;
7533 } else {
7534 cmp = 0;
7535 }
7536 } else {
7537 /* Alphanumeric sorting */
7538 if (server.sort_bypattern) {
7539 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7540 /* At least one compare object is NULL */
7541 if (so1->u.cmpobj == so2->u.cmpobj)
7542 cmp = 0;
7543 else if (so1->u.cmpobj == NULL)
7544 cmp = -1;
7545 else
7546 cmp = 1;
7547 } else {
7548 /* We have both the objects, use strcoll */
7549 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7550 }
7551 } else {
08ee9b57 7552 /* Compare elements directly. */
7553 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 7554 }
7555 }
7556 return server.sort_desc ? -cmp : cmp;
7557}
7558
7559/* The SORT command is the most complex command in Redis. Warning: this code
7560 * is optimized for speed and a bit less for readability */
7561static void sortCommand(redisClient *c) {
ed9b544e 7562 list *operations;
a03611e1 7563 unsigned int outputlen = 0;
ed9b544e 7564 int desc = 0, alpha = 0;
7565 int limit_start = 0, limit_count = -1, start, end;
7566 int j, dontsort = 0, vectorlen;
7567 int getop = 0; /* GET operation counter */
443c6409 7568 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7569 redisSortObject *vector; /* Resulting vector to sort */
7570
7571 /* Lookup the key to sort. It must be of the right types */
3305306f 7572 sortval = lookupKeyRead(c->db,c->argv[1]);
7573 if (sortval == NULL) {
4e27f268 7574 addReply(c,shared.emptymultibulk);
ed9b544e 7575 return;
7576 }
a5eb649b 7577 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7578 sortval->type != REDIS_ZSET)
7579 {
c937aa89 7580 addReply(c,shared.wrongtypeerr);
ed9b544e 7581 return;
7582 }
7583
7584 /* Create a list of operations to perform for every sorted element.
7585 * Operations can be GET/DEL/INCR/DECR */
7586 operations = listCreate();
092dac2a 7587 listSetFreeMethod(operations,zfree);
ed9b544e 7588 j = 2;
7589
7590 /* Now we need to protect sortval incrementing its count, in the future
7591 * SORT may have options able to overwrite/delete keys during the sorting
7592 * and the sorted key itself may get destroied */
7593 incrRefCount(sortval);
7594
7595 /* The SORT command has an SQL-alike syntax, parse it */
7596 while(j < c->argc) {
7597 int leftargs = c->argc-j-1;
7598 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7599 desc = 0;
7600 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7601 desc = 1;
7602 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7603 alpha = 1;
7604 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7605 limit_start = atoi(c->argv[j+1]->ptr);
7606 limit_count = atoi(c->argv[j+2]->ptr);
7607 j+=2;
443c6409 7608 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7609 storekey = c->argv[j+1];
7610 j++;
ed9b544e 7611 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7612 sortby = c->argv[j+1];
7613 /* If the BY pattern does not contain '*', i.e. it is constant,
7614 * we don't need to sort nor to lookup the weight keys. */
7615 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7616 j++;
7617 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7618 listAddNodeTail(operations,createSortOperation(
7619 REDIS_SORT_GET,c->argv[j+1]));
7620 getop++;
7621 j++;
ed9b544e 7622 } else {
7623 decrRefCount(sortval);
7624 listRelease(operations);
c937aa89 7625 addReply(c,shared.syntaxerr);
ed9b544e 7626 return;
7627 }
7628 j++;
7629 }
7630
7631 /* Load the sorting vector with all the objects to sort */
a5eb649b 7632 switch(sortval->type) {
003f0840 7633 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
a5eb649b 7634 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7635 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7636 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7637 }
ed9b544e 7638 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7639 j = 0;
a5eb649b 7640
ed9b544e 7641 if (sortval->type == REDIS_LIST) {
003f0840
PN
7642 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7643 listTypeEntry entry;
7644 while(listTypeNext(li,&entry)) {
7645 vector[j].obj = listTypeGet(&entry);
ed9b544e 7646 vector[j].u.score = 0;
7647 vector[j].u.cmpobj = NULL;
ed9b544e 7648 j++;
7649 }
003f0840 7650 listTypeReleaseIterator(li);
ed9b544e 7651 } else {
a5eb649b 7652 dict *set;
ed9b544e 7653 dictIterator *di;
7654 dictEntry *setele;
7655
a5eb649b 7656 if (sortval->type == REDIS_SET) {
7657 set = sortval->ptr;
7658 } else {
7659 zset *zs = sortval->ptr;
7660 set = zs->dict;
7661 }
7662
ed9b544e 7663 di = dictGetIterator(set);
ed9b544e 7664 while((setele = dictNext(di)) != NULL) {
7665 vector[j].obj = dictGetEntryKey(setele);
7666 vector[j].u.score = 0;
7667 vector[j].u.cmpobj = NULL;
7668 j++;
7669 }
7670 dictReleaseIterator(di);
7671 }
dfc5e96c 7672 redisAssert(j == vectorlen);
ed9b544e 7673
7674 /* Now it's time to load the right scores in the sorting vector */
7675 if (dontsort == 0) {
7676 for (j = 0; j < vectorlen; j++) {
6d7d1370 7677 robj *byval;
ed9b544e 7678 if (sortby) {
6d7d1370 7679 /* lookup value to sort by */
3305306f 7680 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7681 if (!byval) continue;
ed9b544e 7682 } else {
6d7d1370
PN
7683 /* use object itself to sort by */
7684 byval = vector[j].obj;
7685 }
7686
7687 if (alpha) {
08ee9b57 7688 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7689 } else {
7690 if (byval->encoding == REDIS_ENCODING_RAW) {
7691 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7692 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7693 /* Don't need to decode the object if it's
7694 * integer-encoded (the only encoding supported) so
7695 * far. We can just cast it */
16fa22f1
PN
7696 vector[j].u.score = (long)byval->ptr;
7697 } else {
7698 redisAssert(1 != 1);
942a3961 7699 }
ed9b544e 7700 }
6d7d1370 7701
705dad38
PN
7702 /* when the object was retrieved using lookupKeyByPattern,
7703 * its refcount needs to be decreased. */
7704 if (sortby) {
7705 decrRefCount(byval);
ed9b544e 7706 }
7707 }
7708 }
7709
7710 /* We are ready to sort the vector... perform a bit of sanity check
7711 * on the LIMIT option too. We'll use a partial version of quicksort. */
7712 start = (limit_start < 0) ? 0 : limit_start;
7713 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7714 if (start >= vectorlen) {
7715 start = vectorlen-1;
7716 end = vectorlen-2;
7717 }
7718 if (end >= vectorlen) end = vectorlen-1;
7719
7720 if (dontsort == 0) {
7721 server.sort_desc = desc;
7722 server.sort_alpha = alpha;
7723 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7724 if (sortby && (start != 0 || end != vectorlen-1))
7725 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7726 else
7727 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7728 }
7729
7730 /* Send command output to the output buffer, performing the specified
7731 * GET/DEL/INCR/DECR operations if any. */
7732 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7733 if (storekey == NULL) {
7734 /* STORE option not specified, sent the sorting result to client */
7735 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7736 for (j = start; j <= end; j++) {
7737 listNode *ln;
c7df85a4 7738 listIter li;
7739
dd88747b 7740 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7741 listRewind(operations,&li);
7742 while((ln = listNext(&li))) {
443c6409 7743 redisSortOperation *sop = ln->value;
7744 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7745 vector[j].obj);
7746
7747 if (sop->type == REDIS_SORT_GET) {
55017f9d 7748 if (!val) {
443c6409 7749 addReply(c,shared.nullbulk);
7750 } else {
dd88747b 7751 addReplyBulk(c,val);
55017f9d 7752 decrRefCount(val);
443c6409 7753 }
7754 } else {
dfc5e96c 7755 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7756 }
7757 }
ed9b544e 7758 }
443c6409 7759 } else {
74e0f445 7760 robj *sobj = createZiplistObject();
443c6409 7761
7762 /* STORE option specified, set the sorting result as a List object */
7763 for (j = start; j <= end; j++) {
7764 listNode *ln;
c7df85a4 7765 listIter li;
7766
443c6409 7767 if (!getop) {
003f0840 7768 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
a03611e1
PN
7769 } else {
7770 listRewind(operations,&li);
7771 while((ln = listNext(&li))) {
7772 redisSortOperation *sop = ln->value;
7773 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7774 vector[j].obj);
7775
7776 if (sop->type == REDIS_SORT_GET) {
7777 if (!val) val = createStringObject("",0);
7778
003f0840 7779 /* listTypePush does an incrRefCount, so we should take care
a03611e1
PN
7780 * care of the incremented refcount caused by either
7781 * lookupKeyByPattern or createStringObject("",0) */
003f0840 7782 listTypePush(sobj,val,REDIS_TAIL);
a03611e1 7783 decrRefCount(val);
443c6409 7784 } else {
a03611e1
PN
7785 /* always fails */
7786 redisAssert(sop->type == REDIS_SORT_GET);
443c6409 7787 }
ed9b544e 7788 }
ed9b544e 7789 }
ed9b544e 7790 }
846d8b3e 7791 dbReplace(c->db,storekey,sobj);
443c6409 7792 /* Note: we add 1 because the DB is dirty anyway since even if the
7793 * SORT result is empty a new key is set and maybe the old content
7794 * replaced. */
7795 server.dirty += 1+outputlen;
7796 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7797 }
7798
7799 /* Cleanup */
a03611e1
PN
7800 if (sortval->type == REDIS_LIST)
7801 for (j = 0; j < vectorlen; j++)
7802 decrRefCount(vector[j].obj);
ed9b544e 7803 decrRefCount(sortval);
7804 listRelease(operations);
7805 for (j = 0; j < vectorlen; j++) {
16fa22f1 7806 if (alpha && vector[j].u.cmpobj)
ed9b544e 7807 decrRefCount(vector[j].u.cmpobj);
7808 }
7809 zfree(vector);
7810}
7811
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * output is a handful of digits plus the unit suffix and the terminator).
 * Every value of 'n' produces a string: values of one tebibyte or more are
 * rendered with the T and P suffixes instead of leaving 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else {
        /* Anything bigger is expressed in pebibytes so that the output
         * buffer is always initialized. */
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    }
}
7832
1c85b79f 7833/* Create the string returned by the INFO command. This is decoupled
7834 * by the INFO command itself as we need to report the same information
7835 * on memory corruption problems. */
7836static sds genRedisInfoString(void) {
ed9b544e 7837 sds info;
7838 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7839 int j;
ec6c7a1d 7840 char hmem[64];
55a8298f 7841
b72f6a4b 7842 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7843 info = sdscatprintf(sdsempty(),
7844 "redis_version:%s\r\n"
5436146c
PN
7845 "redis_git_sha1:%s\r\n"
7846 "redis_git_dirty:%d\r\n"
f1017b3f 7847 "arch_bits:%s\r\n"
7a932b74 7848 "multiplexing_api:%s\r\n"
0d7170a4 7849 "process_id:%ld\r\n"
682ac724 7850 "uptime_in_seconds:%ld\r\n"
7851 "uptime_in_days:%ld\r\n"
ed9b544e 7852 "connected_clients:%d\r\n"
7853 "connected_slaves:%d\r\n"
f86a74e9 7854 "blocked_clients:%d\r\n"
5fba9f71 7855 "used_memory:%zu\r\n"
ec6c7a1d 7856 "used_memory_human:%s\r\n"
ed9b544e 7857 "changes_since_last_save:%lld\r\n"
be2bb6b0 7858 "bgsave_in_progress:%d\r\n"
682ac724 7859 "last_save_time:%ld\r\n"
b3fad521 7860 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7861 "total_connections_received:%lld\r\n"
7862 "total_commands_processed:%lld\r\n"
2a6a2ed1 7863 "expired_keys:%lld\r\n"
3be2c9d7 7864 "hash_max_zipmap_entries:%zu\r\n"
7865 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7866 "pubsub_channels:%ld\r\n"
7867 "pubsub_patterns:%u\r\n"
7d98e08c 7868 "vm_enabled:%d\r\n"
a0f643ea 7869 "role:%s\r\n"
ed9b544e 7870 ,REDIS_VERSION,
5436146c 7871 REDIS_GIT_SHA1,
274e45e3 7872 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7873 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7874 aeGetApiName(),
0d7170a4 7875 (long) getpid(),
a0f643ea 7876 uptime,
7877 uptime/(3600*24),
ed9b544e 7878 listLength(server.clients)-listLength(server.slaves),
7879 listLength(server.slaves),
d5d55fc3 7880 server.blpop_blocked_clients,
b72f6a4b 7881 zmalloc_used_memory(),
ec6c7a1d 7882 hmem,
ed9b544e 7883 server.dirty,
9d65a1bb 7884 server.bgsavechildpid != -1,
ed9b544e 7885 server.lastsave,
b3fad521 7886 server.bgrewritechildpid != -1,
ed9b544e 7887 server.stat_numconnections,
7888 server.stat_numcommands,
2a6a2ed1 7889 server.stat_expiredkeys,
55a8298f 7890 server.hash_max_zipmap_entries,
7891 server.hash_max_zipmap_value,
ffc6b7f8 7892 dictSize(server.pubsub_channels),
7893 listLength(server.pubsub_patterns),
7d98e08c 7894 server.vm_enabled != 0,
a0f643ea 7895 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7896 );
a0f643ea 7897 if (server.masterhost) {
7898 info = sdscatprintf(info,
7899 "master_host:%s\r\n"
7900 "master_port:%d\r\n"
7901 "master_link_status:%s\r\n"
7902 "master_last_io_seconds_ago:%d\r\n"
7903 ,server.masterhost,
7904 server.masterport,
7905 (server.replstate == REDIS_REPL_CONNECTED) ?
7906 "up" : "down",
f72b934d 7907 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7908 );
7909 }
7d98e08c 7910 if (server.vm_enabled) {
1064ef87 7911 lockThreadedIO();
7d98e08c 7912 info = sdscatprintf(info,
7913 "vm_conf_max_memory:%llu\r\n"
7914 "vm_conf_page_size:%llu\r\n"
7915 "vm_conf_pages:%llu\r\n"
7916 "vm_stats_used_pages:%llu\r\n"
7917 "vm_stats_swapped_objects:%llu\r\n"
7918 "vm_stats_swappin_count:%llu\r\n"
7919 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7920 "vm_stats_io_newjobs_len:%lu\r\n"
7921 "vm_stats_io_processing_len:%lu\r\n"
7922 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7923 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7924 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7925 ,(unsigned long long) server.vm_max_memory,
7926 (unsigned long long) server.vm_page_size,
7927 (unsigned long long) server.vm_pages,
7928 (unsigned long long) server.vm_stats_used_pages,
7929 (unsigned long long) server.vm_stats_swapped_objects,
7930 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7931 (unsigned long long) server.vm_stats_swapouts,
7932 (unsigned long) listLength(server.io_newjobs),
7933 (unsigned long) listLength(server.io_processing),
7934 (unsigned long) listLength(server.io_processed),
d5d55fc3 7935 (unsigned long) server.io_active_threads,
7936 (unsigned long) server.vm_blocked_clients
7d98e08c 7937 );
1064ef87 7938 unlockThreadedIO();
7d98e08c 7939 }
c3cb078d 7940 for (j = 0; j < server.dbnum; j++) {
7941 long long keys, vkeys;
7942
7943 keys = dictSize(server.db[j].dict);
7944 vkeys = dictSize(server.db[j].expires);
7945 if (keys || vkeys) {
9d65a1bb 7946 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7947 j, keys, vkeys);
7948 }
7949 }
1c85b79f 7950 return info;
7951}
7952
7953static void infoCommand(redisClient *c) {
7954 sds info = genRedisInfoString();
83c6a618 7955 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7956 (unsigned long)sdslen(info)));
ed9b544e 7957 addReplySds(c,info);
70003d28 7958 addReply(c,shared.crlf);
ed9b544e 7959}
7960
3305306f 7961static void monitorCommand(redisClient *c) {
7962 /* ignore MONITOR if aleady slave or in monitor mode */
7963 if (c->flags & REDIS_SLAVE) return;
7964
7965 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7966 c->slaveseldb = 0;
6b47e12e 7967 listAddNodeTail(server.monitors,c);
3305306f 7968 addReply(c,shared.ok);
7969}
7970
7971/* ================================= Expire ================================= */
7972static int removeExpire(redisDb *db, robj *key) {
09241813 7973 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
3305306f 7974 return 1;
7975 } else {
7976 return 0;
7977 }
7978}
7979
7980static int setExpire(redisDb *db, robj *key, time_t when) {
09241813 7981 sds copy = sdsdup(key->ptr);
7982 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7983 sdsfree(copy);
3305306f 7984 return 0;
7985 } else {
3305306f 7986 return 1;
7987 }
7988}
7989
bb32ede5 7990/* Return the expire time of the specified key, or -1 if no expire
7991 * is associated with this key (i.e. the key is non volatile) */
7992static time_t getExpire(redisDb *db, robj *key) {
7993 dictEntry *de;
7994
7995 /* No expire? return ASAP */
7996 if (dictSize(db->expires) == 0 ||
09241813 7997 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
bb32ede5 7998
7999 return (time_t) dictGetEntryVal(de);
8000}
8001
3305306f 8002static int expireIfNeeded(redisDb *db, robj *key) {
8003 time_t when;
8004 dictEntry *de;
8005
8006 /* No expire? return ASAP */
8007 if (dictSize(db->expires) == 0 ||
09241813 8008 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 8009
8010 /* Lookup the expire */
8011 when = (time_t) dictGetEntryVal(de);
8012 if (time(NULL) <= when) return 0;
8013
8014 /* Delete the key */
09241813 8015 dbDelete(db,key);
2a6a2ed1 8016 server.stat_expiredkeys++;
09241813 8017 return 1;
3305306f 8018}
8019
8020static int deleteIfVolatile(redisDb *db, robj *key) {
8021 dictEntry *de;
8022
8023 /* No expire? return ASAP */
8024 if (dictSize(db->expires) == 0 ||
09241813 8025 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 8026
8027 /* Delete the key */
0c66a471 8028 server.dirty++;
2a6a2ed1 8029 server.stat_expiredkeys++;
09241813 8030 dictDelete(db->expires,key->ptr);
8031 return dictDelete(db->dict,key->ptr) == DICT_OK;
3305306f 8032}
8033
bbe025e0 8034static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 8035 dictEntry *de;
bbe025e0
AM
8036 time_t seconds;
8037
bd79a6bd 8038 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
8039
8040 seconds -= offset;
3305306f 8041
09241813 8042 de = dictFind(c->db->dict,key->ptr);
3305306f 8043 if (de == NULL) {
8044 addReply(c,shared.czero);
8045 return;
8046 }
d4dd6556 8047 if (seconds <= 0) {
09241813 8048 if (dbDelete(c->db,key)) server.dirty++;
43e5ccdf 8049 addReply(c, shared.cone);
3305306f 8050 return;
8051 } else {
8052 time_t when = time(NULL)+seconds;
802e8373 8053 if (setExpire(c->db,key,when)) {
3305306f 8054 addReply(c,shared.cone);
77423026 8055 server.dirty++;
8056 } else {
3305306f 8057 addReply(c,shared.czero);
77423026 8058 }
3305306f 8059 return;
8060 }
8061}
8062
802e8373 8063static void expireCommand(redisClient *c) {
bbe025e0 8064 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 8065}
8066
8067static void expireatCommand(redisClient *c) {
bbe025e0 8068 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 8069}
8070
fd88489a 8071static void ttlCommand(redisClient *c) {
8072 time_t expire;
8073 int ttl = -1;
8074
8075 expire = getExpire(c->db,c->argv[1]);
8076 if (expire != -1) {
8077 ttl = (int) (expire-time(NULL));
8078 if (ttl < 0) ttl = -1;
8079 }
8080 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
8081}
8082
6e469882 8083/* ================================ MULTI/EXEC ============================== */
8084
8085/* Client state initialization for MULTI/EXEC */
8086static void initClientMultiState(redisClient *c) {
8087 c->mstate.commands = NULL;
8088 c->mstate.count = 0;
8089}
8090
8091/* Release all the resources associated with MULTI/EXEC state */
8092static void freeClientMultiState(redisClient *c) {
8093 int j;
8094
8095 for (j = 0; j < c->mstate.count; j++) {
8096 int i;
8097 multiCmd *mc = c->mstate.commands+j;
8098
8099 for (i = 0; i < mc->argc; i++)
8100 decrRefCount(mc->argv[i]);
8101 zfree(mc->argv);
8102 }
8103 zfree(c->mstate.commands);
8104}
8105
8106/* Add a new command into the MULTI commands queue */
8107static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
8108 multiCmd *mc;
8109 int j;
8110
8111 c->mstate.commands = zrealloc(c->mstate.commands,
8112 sizeof(multiCmd)*(c->mstate.count+1));
8113 mc = c->mstate.commands+c->mstate.count;
8114 mc->cmd = cmd;
8115 mc->argc = c->argc;
8116 mc->argv = zmalloc(sizeof(robj*)*c->argc);
8117 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8118 for (j = 0; j < c->argc; j++)
8119 incrRefCount(mc->argv[j]);
8120 c->mstate.count++;
8121}
8122
8123static void multiCommand(redisClient *c) {
6531c94d 8124 if (c->flags & REDIS_MULTI) {
8125 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8126 return;
8127 }
6e469882 8128 c->flags |= REDIS_MULTI;
36c548f0 8129 addReply(c,shared.ok);
6e469882 8130}
8131
18b6cb76
DJ
8132static void discardCommand(redisClient *c) {
8133 if (!(c->flags & REDIS_MULTI)) {
8134 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8135 return;
8136 }
8137
8138 freeClientMultiState(c);
8139 initClientMultiState(c);
8140 c->flags &= (~REDIS_MULTI);
a2645226 8141 unwatchAllKeys(c);
18b6cb76
DJ
8142 addReply(c,shared.ok);
8143}
8144
66c8853f 8145/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8146 * implememntation for more information. */
8147static void execCommandReplicateMulti(redisClient *c) {
8148 struct redisCommand *cmd;
8149 robj *multistring = createStringObject("MULTI",5);
8150
8151 cmd = lookupCommand("multi");
8152 if (server.appendonly)
8153 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8154 if (listLength(server.slaves))
8155 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8156 decrRefCount(multistring);
8157}
8158
6e469882 8159static void execCommand(redisClient *c) {
8160 int j;
8161 robj **orig_argv;
8162 int orig_argc;
8163
8164 if (!(c->flags & REDIS_MULTI)) {
8165 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8166 return;
8167 }
8168
37ab76c9 8169 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8170 * A failed EXEC will return a multi bulk nil object. */
8171 if (c->flags & REDIS_DIRTY_CAS) {
8172 freeClientMultiState(c);
8173 initClientMultiState(c);
8174 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8175 unwatchAllKeys(c);
8176 addReply(c,shared.nullmultibulk);
8177 return;
8178 }
8179
66c8853f 8180 /* Replicate a MULTI request now that we are sure the block is executed.
8181 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8182 * both the AOF and the replication link will have the same consistency
8183 * and atomicity guarantees. */
8184 execCommandReplicateMulti(c);
8185
8186 /* Exec all the queued commands */
1ad4d316 8187 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 8188 orig_argv = c->argv;
8189 orig_argc = c->argc;
8190 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8191 for (j = 0; j < c->mstate.count; j++) {
8192 c->argc = c->mstate.commands[j].argc;
8193 c->argv = c->mstate.commands[j].argv;
8194 call(c,c->mstate.commands[j].cmd);
8195 }
8196 c->argv = orig_argv;
8197 c->argc = orig_argc;
8198 freeClientMultiState(c);
8199 initClientMultiState(c);
1ad4d316 8200 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 8201 /* Make sure the EXEC command is always replicated / AOF, since we
8202 * always send the MULTI command (we can't know beforehand if the
8203 * next operations will contain at least a modification to the DB). */
8204 server.dirty++;
6e469882 8205}
8206
4409877e 8207/* =========================== Blocking Operations ========================= */
8208
8209/* Currently Redis blocking operations support is limited to list POP ops,
8210 * so the current implementation is not fully generic, but it is also not
8211 * completely specific so it will not require a rewrite to support new
8212 * kind of blocking operations in the future.
8213 *
8214 * Still it's important to note that list blocking operations can be already
8215 * used as a notification mechanism in order to implement other blocking
8216 * operations at application level, so there must be a very strong evidence
8217 * of usefulness and generality before new blocking operations are implemented.
8218 *
8219 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
8233 *
8234 * The above comment and the source code should be enough in order to understand
8235 * the implementation and modify / fix it later.
8236 */
8237
8238/* Set a client in blocking mode for the specified key, with the specified
8239 * timeout */
b177fd30 8240static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 8241 dictEntry *de;
8242 list *l;
b177fd30 8243 int j;
4409877e 8244
37ab76c9 8245 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8246 c->blocking_keys_num = numkeys;
4409877e 8247 c->blockingto = timeout;
b177fd30 8248 for (j = 0; j < numkeys; j++) {
8249 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 8250 c->blocking_keys[j] = keys[j];
b177fd30 8251 incrRefCount(keys[j]);
4409877e 8252
b177fd30 8253 /* And in the other "side", to map keys -> clients */
37ab76c9 8254 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 8255 if (de == NULL) {
8256 int retval;
8257
8258 /* For every key we take a list of clients blocked for it */
8259 l = listCreate();
37ab76c9 8260 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 8261 incrRefCount(keys[j]);
8262 assert(retval == DICT_OK);
8263 } else {
8264 l = dictGetEntryVal(de);
8265 }
8266 listAddNodeTail(l,c);
4409877e 8267 }
b177fd30 8268 /* Mark the client as a blocked client */
4409877e 8269 c->flags |= REDIS_BLOCKED;
d5d55fc3 8270 server.blpop_blocked_clients++;
4409877e 8271}
8272
8273/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 8274static void unblockClientWaitingData(redisClient *c) {
4409877e 8275 dictEntry *de;
8276 list *l;
b177fd30 8277 int j;
4409877e 8278
37ab76c9 8279 assert(c->blocking_keys != NULL);
b177fd30 8280 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 8281 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 8282 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 8283 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 8284 assert(de != NULL);
8285 l = dictGetEntryVal(de);
8286 listDelNode(l,listSearchKey(l,c));
8287 /* If the list is empty we need to remove it to avoid wasting memory */
8288 if (listLength(l) == 0)
37ab76c9 8289 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8290 decrRefCount(c->blocking_keys[j]);
b177fd30 8291 }
8292 /* Cleanup the client structure */
37ab76c9 8293 zfree(c->blocking_keys);
8294 c->blocking_keys = NULL;
4409877e 8295 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 8296 server.blpop_blocked_clients--;
5921aa36 8297 /* We want to process data if there is some command waiting
b0d8747d 8298 * in the input buffer. Note that this is safe even if
8299 * unblockClientWaitingData() gets called from freeClient() because
8300 * freeClient() will be smart enough to call this function
8301 * *after* c->querybuf was set to NULL. */
4409877e 8302 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8303}
8304
8305/* This should be called from any function PUSHing into lists.
8306 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8307 * 'ele' is the element pushed.
8308 *
8309 * If the function returns 0 there was no client waiting for a list push
8310 * against this key.
8311 *
8312 * If the function returns 1 there was a client waiting for a list push
8313 * against this key, the element was passed to this client thus it's not
8314 * needed to actually add it to the list and the caller should return asap. */
8315static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8316 struct dictEntry *de;
8317 redisClient *receiver;
8318 list *l;
8319 listNode *ln;
8320
37ab76c9 8321 de = dictFind(c->db->blocking_keys,key);
4409877e 8322 if (de == NULL) return 0;
8323 l = dictGetEntryVal(de);
8324 ln = listFirst(l);
8325 assert(ln != NULL);
8326 receiver = ln->value;
4409877e 8327
b177fd30 8328 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 8329 addReplyBulk(receiver,key);
8330 addReplyBulk(receiver,ele);
b0d8747d 8331 unblockClientWaitingData(receiver);
4409877e 8332 return 1;
8333}
8334
8335/* Blocking RPOP/LPOP */
8336static void blockingPopGenericCommand(redisClient *c, int where) {
8337 robj *o;
8338 time_t timeout;
b177fd30 8339 int j;
4409877e 8340
b177fd30 8341 for (j = 1; j < c->argc-1; j++) {
8342 o = lookupKeyWrite(c->db,c->argv[j]);
8343 if (o != NULL) {
8344 if (o->type != REDIS_LIST) {
8345 addReply(c,shared.wrongtypeerr);
4409877e 8346 return;
b177fd30 8347 } else {
8348 list *list = o->ptr;
8349 if (listLength(list) != 0) {
8350 /* If the list contains elements fall back to the usual
8351 * non-blocking POP operation */
8352 robj *argv[2], **orig_argv;
8353 int orig_argc;
e0a62c7f 8354
b177fd30 8355 /* We need to alter the command arguments before to call
8356 * popGenericCommand() as the command takes a single key. */
8357 orig_argv = c->argv;
8358 orig_argc = c->argc;
8359 argv[1] = c->argv[j];
8360 c->argv = argv;
8361 c->argc = 2;
8362
8363 /* Also the return value is different, we need to output
8364 * the multi bulk reply header and the key name. The
8365 * "real" command will add the last element (the value)
8366 * for us. If this souds like an hack to you it's just
8367 * because it is... */
8368 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 8369 addReplyBulk(c,argv[1]);
b177fd30 8370 popGenericCommand(c,where);
8371
8372 /* Fix the client structure with the original stuff */
8373 c->argv = orig_argv;
8374 c->argc = orig_argc;
8375 return;
8376 }
4409877e 8377 }
8378 }
8379 }
8380 /* If the list is empty or the key does not exists we must block */
b177fd30 8381 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 8382 if (timeout > 0) timeout += time(NULL);
b177fd30 8383 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 8384}
8385
8386static void blpopCommand(redisClient *c) {
8387 blockingPopGenericCommand(c,REDIS_HEAD);
8388}
8389
8390static void brpopCommand(redisClient *c) {
8391 blockingPopGenericCommand(c,REDIS_TAIL);
8392}
8393
ed9b544e 8394/* =============================== Replication ============================= */
8395
a4d1ba9a 8396static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8397 ssize_t nwritten, ret = size;
8398 time_t start = time(NULL);
8399
8400 timeout++;
8401 while(size) {
8402 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8403 nwritten = write(fd,ptr,size);
8404 if (nwritten == -1) return -1;
8405 ptr += nwritten;
8406 size -= nwritten;
8407 }
8408 if ((time(NULL)-start) > timeout) {
8409 errno = ETIMEDOUT;
8410 return -1;
8411 }
8412 }
8413 return ret;
8414}
8415
a4d1ba9a 8416static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8417 ssize_t nread, totread = 0;
8418 time_t start = time(NULL);
8419
8420 timeout++;
8421 while(size) {
8422 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8423 nread = read(fd,ptr,size);
8424 if (nread == -1) return -1;
8425 ptr += nread;
8426 size -= nread;
8427 totread += nread;
8428 }
8429 if ((time(NULL)-start) > timeout) {
8430 errno = ETIMEDOUT;
8431 return -1;
8432 }
8433 }
8434 return totread;
8435}
8436
/* Read a line from 'fd' one byte at a time, storing it into 'ptr' as a
 * null terminated string without the trailing "\n" (and without the "\r"
 * preceding it, if any).
 *
 * 'size' is the size of the buffer pointed by 'ptr': at most size-1 bytes
 * are stored plus the terminator, longer lines are truncated. Every byte
 * is read with syncRead() using 'timeout'.
 *
 * Returns the number of bytes stored before the newline was found (a
 * stripped "\r" is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* No room even for the terminator: nothing can be stored. */
    if (size <= 0) return 0;
    *ptr = '\0';

    size--; /* Reserve one byte for the null term. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the stored byte, otherwise a line longer than
             * the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
8457
8458static void syncCommand(redisClient *c) {
40d224a9 8459 /* ignore SYNC if aleady slave or in monitor mode */
8460 if (c->flags & REDIS_SLAVE) return;
8461
8462 /* SYNC can't be issued when the server has pending data to send to
8463 * the client about already issued commands. We need a fresh reply
8464 * buffer registering the differences between the BGSAVE and the current
8465 * dataset, so that we can copy to other slaves if needed. */
8466 if (listLength(c->reply) != 0) {
8467 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8468 return;
8469 }
8470
8471 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8472 /* Here we need to check if there is a background saving operation
8473 * in progress, or if it is required to start one */
9d65a1bb 8474 if (server.bgsavechildpid != -1) {
40d224a9 8475 /* Ok a background save is in progress. Let's check if it is a good
8476 * one for replication, i.e. if there is another slave that is
8477 * registering differences since the server forked to save */
8478 redisClient *slave;
8479 listNode *ln;
c7df85a4 8480 listIter li;
40d224a9 8481
c7df85a4 8482 listRewind(server.slaves,&li);
8483 while((ln = listNext(&li))) {
40d224a9 8484 slave = ln->value;
8485 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 8486 }
8487 if (ln) {
8488 /* Perfect, the server is already registering differences for
8489 * another slave. Set the right state, and copy the buffer. */
8490 listRelease(c->reply);
8491 c->reply = listDup(slave->reply);
40d224a9 8492 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8493 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8494 } else {
8495 /* No way, we need to wait for the next BGSAVE in order to
8496 * register differences */
8497 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8498 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8499 }
8500 } else {
8501 /* Ok we don't have a BGSAVE in progress, let's start one */
8502 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8503 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8504 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8505 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8506 return;
8507 }
8508 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8509 }
6208b3a7 8510 c->repldbfd = -1;
40d224a9 8511 c->flags |= REDIS_SLAVE;
8512 c->slaveseldb = 0;
6b47e12e 8513 listAddNodeTail(server.slaves,c);
40d224a9 8514 return;
8515}
8516
6208b3a7 8517static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8518 redisClient *slave = privdata;
8519 REDIS_NOTUSED(el);
8520 REDIS_NOTUSED(mask);
8521 char buf[REDIS_IOBUF_LEN];
8522 ssize_t nwritten, buflen;
8523
8524 if (slave->repldboff == 0) {
8525 /* Write the bulk write count before to transfer the DB. In theory here
8526 * we don't know how much room there is in the output buffer of the
8527 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8528 * operations) will never be smaller than the few bytes we need. */
8529 sds bulkcount;
8530
8531 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8532 slave->repldbsize);
8533 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8534 {
8535 sdsfree(bulkcount);
8536 freeClient(slave);
8537 return;
8538 }
8539 sdsfree(bulkcount);
8540 }
8541 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8542 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8543 if (buflen <= 0) {
8544 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8545 (buflen == 0) ? "premature EOF" : strerror(errno));
8546 freeClient(slave);
8547 return;
8548 }
8549 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 8550 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 8551 strerror(errno));
8552 freeClient(slave);
8553 return;
8554 }
8555 slave->repldboff += nwritten;
8556 if (slave->repldboff == slave->repldbsize) {
8557 close(slave->repldbfd);
8558 slave->repldbfd = -1;
8559 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8560 slave->replstate = REDIS_REPL_ONLINE;
8561 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8562 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8563 freeClient(slave);
8564 return;
8565 }
8566 addReplySds(slave,sdsempty());
8567 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8568 }
8569}
ed9b544e 8570
a3b21203 8571/* This function is called at the end of every backgrond saving.
8572 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8573 * otherwise REDIS_ERR is passed to the function.
8574 *
8575 * The goal of this function is to handle slaves waiting for a successful
8576 * background saving in order to perform non-blocking synchronization. */
8577static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8578 listNode *ln;
8579 int startbgsave = 0;
c7df85a4 8580 listIter li;
ed9b544e 8581
c7df85a4 8582 listRewind(server.slaves,&li);
8583 while((ln = listNext(&li))) {
6208b3a7 8584 redisClient *slave = ln->value;
ed9b544e 8585
6208b3a7 8586 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8587 startbgsave = 1;
8588 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8589 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8590 struct redis_stat buf;
e0a62c7f 8591
6208b3a7 8592 if (bgsaveerr != REDIS_OK) {
8593 freeClient(slave);
8594 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8595 continue;
8596 }
8597 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8598 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8599 freeClient(slave);
8600 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8601 continue;
8602 }
8603 slave->repldboff = 0;
8604 slave->repldbsize = buf.st_size;
8605 slave->replstate = REDIS_REPL_SEND_BULK;
8606 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8607 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8608 freeClient(slave);
8609 continue;
8610 }
8611 }
ed9b544e 8612 }
6208b3a7 8613 if (startbgsave) {
8614 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8615 listIter li;
8616
8617 listRewind(server.slaves,&li);
6208b3a7 8618 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8619 while((ln = listNext(&li))) {
6208b3a7 8620 redisClient *slave = ln->value;
ed9b544e 8621
6208b3a7 8622 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8623 freeClient(slave);
8624 }
8625 }
8626 }
ed9b544e 8627}
8628
8629static int syncWithMaster(void) {
d0ccebcf 8630 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8631 long dumpsize;
ed9b544e 8632 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8633 int dfd, maxtries = 5;
ed9b544e 8634
8635 if (fd == -1) {
8636 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8637 strerror(errno));
8638 return REDIS_ERR;
8639 }
d0ccebcf 8640
8641 /* AUTH with the master if required. */
8642 if(server.masterauth) {
8643 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8644 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8645 close(fd);
8646 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8647 strerror(errno));
8648 return REDIS_ERR;
8649 }
8650 /* Read the AUTH result. */
8651 if (syncReadLine(fd,buf,1024,3600) == -1) {
8652 close(fd);
8653 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8654 strerror(errno));
8655 return REDIS_ERR;
8656 }
8657 if (buf[0] != '+') {
8658 close(fd);
8659 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8660 return REDIS_ERR;
8661 }
8662 }
8663
ed9b544e 8664 /* Issue the SYNC command */
8665 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8666 close(fd);
8667 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8668 strerror(errno));
8669 return REDIS_ERR;
8670 }
8671 /* Read the bulk write count */
8c4d91fc 8672 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8673 close(fd);
8674 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8675 strerror(errno));
8676 return REDIS_ERR;
8677 }
4aa701c1 8678 if (buf[0] != '$') {
8679 close(fd);
8680 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8681 return REDIS_ERR;
8682 }
18e61fa2 8683 dumpsize = strtol(buf+1,NULL,10);
8684 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8685 /* Read the bulk write data on a temp file */
8c5abee8 8686 while(maxtries--) {
8687 snprintf(tmpfile,256,
8688 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8689 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8690 if (dfd != -1) break;
5de9ad7c 8691 sleep(1);
8c5abee8 8692 }
ed9b544e 8693 if (dfd == -1) {
8694 close(fd);
8695 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8696 return REDIS_ERR;
8697 }
8698 while(dumpsize) {
8699 int nread, nwritten;
8700
8701 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8702 if (nread == -1) {
8703 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8704 strerror(errno));
8705 close(fd);
8706 close(dfd);
8707 return REDIS_ERR;
8708 }
8709 nwritten = write(dfd,buf,nread);
8710 if (nwritten == -1) {
8711 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8712 close(fd);
8713 close(dfd);
8714 return REDIS_ERR;
8715 }
8716 dumpsize -= nread;
8717 }
8718 close(dfd);
8719 if (rename(tmpfile,server.dbfilename) == -1) {
8720 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8721 unlink(tmpfile);
8722 close(fd);
8723 return REDIS_ERR;
8724 }
8725 emptyDb();
f78fd11b 8726 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8727 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8728 close(fd);
8729 return REDIS_ERR;
8730 }
8731 server.master = createClient(fd);
8732 server.master->flags |= REDIS_MASTER;
179b3952 8733 server.master->authenticated = 1;
ed9b544e 8734 server.replstate = REDIS_REPL_CONNECTED;
8735 return REDIS_OK;
8736}
8737
321b0e13 8738static void slaveofCommand(redisClient *c) {
8739 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8740 !strcasecmp(c->argv[2]->ptr,"one")) {
8741 if (server.masterhost) {
8742 sdsfree(server.masterhost);
8743 server.masterhost = NULL;
8744 if (server.master) freeClient(server.master);
8745 server.replstate = REDIS_REPL_NONE;
8746 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8747 }
8748 } else {
8749 sdsfree(server.masterhost);
8750 server.masterhost = sdsdup(c->argv[1]->ptr);
8751 server.masterport = atoi(c->argv[2]->ptr);
8752 if (server.master) freeClient(server.master);
8753 server.replstate = REDIS_REPL_CONNECT;
8754 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8755 server.masterhost, server.masterport);
8756 }
8757 addReply(c,shared.ok);
8758}
8759
3fd78bcd 8760/* ============================ Maxmemory directive ======================== */
8761
a5819310 8762/* Try to free one object form the pre-allocated objects free list.
8763 * This is useful under low mem conditions as by default we take 1 million
8764 * free objects allocated. On success REDIS_OK is returned, otherwise
8765 * REDIS_ERR. */
8766static int tryFreeOneObjectFromFreelist(void) {
f870935d 8767 robj *o;
8768
a5819310 8769 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8770 if (listLength(server.objfreelist)) {
8771 listNode *head = listFirst(server.objfreelist);
8772 o = listNodeValue(head);
8773 listDelNode(server.objfreelist,head);
8774 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8775 zfree(o);
8776 return REDIS_OK;
8777 } else {
8778 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8779 return REDIS_ERR;
8780 }
f870935d 8781}
8782
3fd78bcd 8783/* This function gets called when 'maxmemory' is set on the config file to limit
8784 * the max memory used by the server, and we are out of memory.
8785 * This function will try to, in order:
8786 *
8787 * - Free objects from the free list
8788 * - Try to remove keys with an EXPIRE set
8789 *
8790 * It is not possible to free enough memory to reach used-memory < maxmemory
8791 * the server will start refusing commands that will enlarge even more the
8792 * memory usage.
8793 */
8794static void freeMemoryIfNeeded(void) {
8795 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8796 int j, k, freed = 0;
8797
8798 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8799 for (j = 0; j < server.dbnum; j++) {
8800 int minttl = -1;
8801 robj *minkey = NULL;
8802 struct dictEntry *de;
8803
8804 if (dictSize(server.db[j].expires)) {
8805 freed = 1;
8806 /* From a sample of three keys drop the one nearest to
8807 * the natural expire */
8808 for (k = 0; k < 3; k++) {
8809 time_t t;
8810
8811 de = dictGetRandomKey(server.db[j].expires);
8812 t = (time_t) dictGetEntryVal(de);
8813 if (minttl == -1 || t < minttl) {
8814 minkey = dictGetEntryKey(de);
8815 minttl = t;
3fd78bcd 8816 }
3fd78bcd 8817 }
09241813 8818 dbDelete(server.db+j,minkey);
3fd78bcd 8819 }
3fd78bcd 8820 }
a5819310 8821 if (!freed) return; /* nothing to free... */
3fd78bcd 8822 }
8823}
8824
f80dff62 8825/* ============================== Append Only file ========================== */
8826
560db612 8827/* Called when the user switches from "appendonly yes" to "appendonly no"
8828 * at runtime using the CONFIG command. */
8829static void stopAppendOnly(void) {
8830 flushAppendOnlyFile();
8831 aof_fsync(server.appendfd);
8832 close(server.appendfd);
8833
8834 server.appendfd = -1;
8835 server.appendseldb = -1;
8836 server.appendonly = 0;
8837 /* rewrite operation in progress? kill it, wait child exit */
8838 if (server.bgsavechildpid != -1) {
8839 int statloc;
8840
8841 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8842 wait3(&statloc,0,NULL);
8843 /* reset the buffer accumulating changes while the child saves */
8844 sdsfree(server.bgrewritebuf);
8845 server.bgrewritebuf = sdsempty();
8846 server.bgsavechildpid = -1;
8847 }
8848}
8849
8850/* Called when the user switches from "appendonly no" to "appendonly yes"
8851 * at runtime using the CONFIG command. */
8852static int startAppendOnly(void) {
8853 server.appendonly = 1;
8854 server.lastfsync = time(NULL);
8855 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8856 if (server.appendfd == -1) {
8857 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8858 return REDIS_ERR;
8859 }
8860 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8861 server.appendonly = 0;
8862 close(server.appendfd);
8863 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8864 return REDIS_ERR;
8865 }
8866 return REDIS_OK;
8867}
8868
28ed1f33 8869/* Write the append only file buffer on disk.
8870 *
8871 * Since we are required to write the AOF before replying to the client,
8872 * and the only way the client socket can get a write is entering when the
8873 * the event loop, we accumulate all the AOF writes in a memory
8874 * buffer and write it on disk using this function just before entering
8875 * the event loop again. */
8876static void flushAppendOnlyFile(void) {
8877 time_t now;
8878 ssize_t nwritten;
8879
8880 if (sdslen(server.aofbuf) == 0) return;
8881
8882 /* We want to perform a single write. This should be guaranteed atomic
8883 * at least if the filesystem we are writing is a real physical one.
8884 * While this will save us against the server being killed I don't think
8885 * there is much to do about the whole server stopping for power problems
8886 * or alike */
8887 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8888 if (nwritten != (signed)sdslen(server.aofbuf)) {
8889 /* Ooops, we are in troubles. The best thing to do for now is
8890 * aborting instead of giving the illusion that everything is
8891 * working as expected. */
8892 if (nwritten == -1) {
8893 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8894 } else {
8895 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8896 }
8897 exit(1);
8898 }
8899 sdsfree(server.aofbuf);
8900 server.aofbuf = sdsempty();
8901
38db9171 8902 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8903 * childs performing heavy I/O on disk. */
8904 if (server.no_appendfsync_on_rewrite &&
8905 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8906 return;
28ed1f33 8907 /* Fsync if needed */
8908 now = time(NULL);
8909 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8910 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8911 now-server.lastfsync > 1))
8912 {
8913 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8914 * flushing metadata. */
8915 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8916 server.lastfsync = now;
8917 }
8918}
8919
9376e434
PN
8920static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8921 int j;
8922 buf = sdscatprintf(buf,"*%d\r\n",argc);
8923 for (j = 0; j < argc; j++) {
8924 robj *o = getDecodedObject(argv[j]);
8925 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8926 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8927 buf = sdscatlen(buf,"\r\n",2);
8928 decrRefCount(o);
8929 }
8930 return buf;
8931}
8932
8933static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8934 int argc = 3;
8935 long when;
8936 robj *argv[3];
8937
8938 /* Make sure we can use strtol */
8939 seconds = getDecodedObject(seconds);
8940 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8941 decrRefCount(seconds);
8942
8943 argv[0] = createStringObject("EXPIREAT",8);
8944 argv[1] = key;
8945 argv[2] = createObject(REDIS_STRING,
8946 sdscatprintf(sdsempty(),"%ld",when));
8947 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8948 decrRefCount(argv[0]);
8949 decrRefCount(argv[2]);
8950 return buf;
8951}
8952
f80dff62 8953static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8954 sds buf = sdsempty();
f80dff62 8955 robj *tmpargv[3];
8956
8957 /* The DB this command was targetting is not the same as the last command
8958 * we appendend. To issue a SELECT command is needed. */
8959 if (dictid != server.appendseldb) {
8960 char seldb[64];
8961
8962 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8963 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8964 (unsigned long)strlen(seldb),seldb);
f80dff62 8965 server.appendseldb = dictid;
8966 }
8967
f80dff62 8968 if (cmd->proc == expireCommand) {
9376e434
PN
8969 /* Translate EXPIRE into EXPIREAT */
8970 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8971 } else if (cmd->proc == setexCommand) {
8972 /* Translate SETEX to SET and EXPIREAT */
8973 tmpargv[0] = createStringObject("SET",3);
f80dff62 8974 tmpargv[1] = argv[1];
9376e434
PN
8975 tmpargv[2] = argv[3];
8976 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8977 decrRefCount(tmpargv[0]);
8978 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8979 } else {
8980 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8981 }
8982
28ed1f33 8983 /* Append to the AOF buffer. This will be flushed on disk just before
8984 * of re-entering the event loop, so before the client will get a
8985 * positive reply about the operation performed. */
8986 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8987
85a83172 8988 /* If a background append only file rewriting is in progress we want to
8989 * accumulate the differences between the child DB and the current one
8990 * in a buffer, so that when the child process will do its work we
8991 * can append the differences to the new append only file. */
8992 if (server.bgrewritechildpid != -1)
8993 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8994
8995 sdsfree(buf);
f80dff62 8996}
8997
8998/* In Redis commands are always executed in the context of a client, so in
8999 * order to load the append only file we need to create a fake client. */
9000static struct redisClient *createFakeClient(void) {
9001 struct redisClient *c = zmalloc(sizeof(*c));
9002
9003 selectDb(c,0);
9004 c->fd = -1;
9005 c->querybuf = sdsempty();
9006 c->argc = 0;
9007 c->argv = NULL;
9008 c->flags = 0;
9387d17d 9009 /* We set the fake client as a slave waiting for the synchronization
9010 * so that Redis will not try to send replies to this client. */
9011 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 9012 c->reply = listCreate();
9013 listSetFreeMethod(c->reply,decrRefCount);
9014 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 9015 initClientMultiState(c);
f80dff62 9016 return c;
9017}
9018
9019static void freeFakeClient(struct redisClient *c) {
9020 sdsfree(c->querybuf);
9021 listRelease(c->reply);
4132ad8d 9022 freeClientMultiState(c);
f80dff62 9023 zfree(c);
9024}
9025
9026/* Replay the append log file. On error REDIS_OK is returned. On non fatal
9027 * error (the append only file is zero-length) REDIS_ERR is returned. On
9028 * fatal error an error message is logged and the program exists. */
9029int loadAppendOnlyFile(char *filename) {
9030 struct redisClient *fakeClient;
9031 FILE *fp = fopen(filename,"r");
9032 struct redis_stat sb;
4132ad8d 9033 int appendonly = server.appendonly;
f80dff62 9034
9035 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
9036 return REDIS_ERR;
9037
9038 if (fp == NULL) {
9039 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
9040 exit(1);
9041 }
9042
4132ad8d
PN
9043 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
9044 * to the same file we're about to read. */
9045 server.appendonly = 0;
9046
f80dff62 9047 fakeClient = createFakeClient();
9048 while(1) {
9049 int argc, j;
9050 unsigned long len;
9051 robj **argv;
9052 char buf[128];
9053 sds argsds;
9054 struct redisCommand *cmd;
a89b7013 9055 int force_swapout;
f80dff62 9056
9057 if (fgets(buf,sizeof(buf),fp) == NULL) {
9058 if (feof(fp))
9059 break;
9060 else
9061 goto readerr;
9062 }
9063 if (buf[0] != '*') goto fmterr;
9064 argc = atoi(buf+1);
9065 argv = zmalloc(sizeof(robj*)*argc);
9066 for (j = 0; j < argc; j++) {
9067 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
9068 if (buf[0] != '$') goto fmterr;
9069 len = strtol(buf+1,NULL,10);
9070 argsds = sdsnewlen(NULL,len);
0f151ef1 9071 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 9072 argv[j] = createObject(REDIS_STRING,argsds);
9073 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
9074 }
9075
9076 /* Command lookup */
9077 cmd = lookupCommand(argv[0]->ptr);
9078 if (!cmd) {
9079 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
9080 exit(1);
9081 }
bdcb92f2 9082 /* Try object encoding */
f80dff62 9083 if (cmd->flags & REDIS_CMD_BULK)
05df7621 9084 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 9085 /* Run the command in the context of a fake client */
9086 fakeClient->argc = argc;
9087 fakeClient->argv = argv;
9088 cmd->proc(fakeClient);
9089 /* Discard the reply objects list from the fake client */
9090 while(listLength(fakeClient->reply))
9091 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
9092 /* Clean up, ready for the next command */
9093 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
9094 zfree(argv);
b492cf00 9095 /* Handle swapping while loading big datasets when VM is on */
a89b7013 9096 force_swapout = 0;
9097 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
9098 force_swapout = 1;
9099
9100 if (server.vm_enabled && force_swapout) {
b492cf00 9101 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 9102 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 9103 }
9104 }
f80dff62 9105 }
4132ad8d
PN
9106
9107 /* This point can only be reached when EOF is reached without errors.
9108 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9109 if (fakeClient->flags & REDIS_MULTI) goto readerr;
9110
f80dff62 9111 fclose(fp);
9112 freeFakeClient(fakeClient);
4132ad8d 9113 server.appendonly = appendonly;
f80dff62 9114 return REDIS_OK;
9115
9116readerr:
9117 if (feof(fp)) {
9118 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9119 } else {
9120 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9121 }
9122 exit(1);
9123fmterr:
9124 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9125 exit(1);
9126}
9127
/* Write the 'len' bytes at 's' to 'fp' as a binary-safe bulk string:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" prefix by hand */
    header[hlen++] = '$';
    hlen += ll2string(header+1,sizeof(header)-1,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: skip the zero sized write */
    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
9142
/* Write the double 'd' to 'fp' in bulk format, $<count>\r\n<payload>\r\n,
 * where the payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload already carries its trailing CRLF, that must not be
     * counted in the advertised length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payloadlen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return fwrite(payload,payloadlen,1,fp) != 0;
}
9153
/* Write the integer 'l' to 'fp' in bulk format $<count>\r\n<payload>\r\n,
 * using a single fwrite() call. Returns 1 on success, 0 on write error. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], bulk[128];
    unsigned int ndigits = ll2string(digits,32,l);
    unsigned int bulklen = snprintf(bulk,sizeof(bulk),"$%u\r\n%s\r\n",
                                    ndigits,digits);

    return fwrite(bulk,bulklen,1,fp) != 0;
}
9163
9eaef89f
PN
9164/* Delegate writing an object to writing a bulk string or bulk long long. */
9165static int fwriteBulkObject(FILE *fp, robj *obj) {
9166 /* Avoid using getDecodedObject to help copy-on-write (we are often
9167 * in a child process when this function is called). */
9168 if (obj->encoding == REDIS_ENCODING_INT) {
9169 return fwriteBulkLongLong(fp,(long)obj->ptr);
9170 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9171 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9172 } else {
9173 redisPanic("Unknown string encoding");
9174 }
9175}
9176
9d65a1bb 9177/* Write a sequence of commands able to fully rebuild the dataset into
9178 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9179static int rewriteAppendOnlyFile(char *filename) {
9180 dictIterator *di = NULL;
9181 dictEntry *de;
9182 FILE *fp;
9183 char tmpfile[256];
9184 int j;
9185 time_t now = time(NULL);
9186
9187 /* Note that we have to use a different temp name here compared to the
9188 * one used by rewriteAppendOnlyFileBackground() function. */
9189 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9190 fp = fopen(tmpfile,"w");
9191 if (!fp) {
9192 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9193 return REDIS_ERR;
9194 }
9195 for (j = 0; j < server.dbnum; j++) {
9196 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9197 redisDb *db = server.db+j;
9198 dict *d = db->dict;
9199 if (dictSize(d) == 0) continue;
9200 di = dictGetIterator(d);
9201 if (!di) {
9202 fclose(fp);
9203 return REDIS_ERR;
9204 }
9205
9206 /* SELECT the new DB */
9207 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9eaef89f 9208 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9d65a1bb 9209
9210 /* Iterate this DB writing every entry */
9211 while((de = dictNext(di)) != NULL) {
09241813 9212 sds keystr = dictGetEntryKey(de);
9213 robj key, *o;
e7546c63 9214 time_t expiretime;
9215 int swapped;
9216
09241813 9217 keystr = dictGetEntryKey(de);
560db612 9218 o = dictGetEntryVal(de);
09241813 9219 initStaticStringObject(key,keystr);
b9bc0eef 9220 /* If the value for this key is swapped, load a preview in memory.
9221 * We use a "swapped" flag to remember if we need to free the
9222 * value object instead to just increment the ref count anyway
9223 * in order to avoid copy-on-write of pages if we are forked() */
560db612 9224 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9225 o->storage == REDIS_VM_SWAPPING) {
e7546c63 9226 swapped = 0;
9227 } else {
560db612 9228 o = vmPreviewObject(o);
e7546c63 9229 swapped = 1;
9230 }
09241813 9231 expiretime = getExpire(db,&key);
9d65a1bb 9232
9233 /* Save the key and associated value */
9d65a1bb 9234 if (o->type == REDIS_STRING) {
9235 /* Emit a SET command */
9236 char cmd[]="*3\r\n$3\r\nSET\r\n";
9237 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9238 /* Key and value */
09241813 9239 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9240 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 9241 } else if (o->type == REDIS_LIST) {
9242 /* Emit the RPUSHes needed to rebuild the list */
6ddc908a
PN
9243 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9244 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9245 unsigned char *zl = o->ptr;
9246 unsigned char *p = ziplistIndex(zl,0);
9247 unsigned char *vstr;
9248 unsigned int vlen;
9249 long long vlong;
9250
9251 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9252 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9253 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9254 if (vstr) {
9255 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9256 goto werr;
9257 } else {
9258 if (fwriteBulkLongLong(fp,vlong) == 0)
9259 goto werr;
9260 }
9261 p = ziplistNext(zl,p);
9262 }
9263 } else if (o->encoding == REDIS_ENCODING_LIST) {
9264 list *list = o->ptr;
9265 listNode *ln;
9266 listIter li;
9267
9268 listRewind(list,&li);
9269 while((ln = listNext(&li))) {
9270 robj *eleobj = listNodeValue(ln);
9271
9272 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9273 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9274 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9275 }
9276 } else {
9277 redisPanic("Unknown list encoding");
9d65a1bb 9278 }
9279 } else if (o->type == REDIS_SET) {
9280 /* Emit the SADDs needed to rebuild the set */
9281 dict *set = o->ptr;
9282 dictIterator *di = dictGetIterator(set);
9283 dictEntry *de;
9284
9285 while((de = dictNext(di)) != NULL) {
9286 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9287 robj *eleobj = dictGetEntryKey(de);
9288
9289 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9290 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9291 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9292 }
9293 dictReleaseIterator(di);
9294 } else if (o->type == REDIS_ZSET) {
9295 /* Emit the ZADDs needed to rebuild the sorted set */
9296 zset *zs = o->ptr;
9297 dictIterator *di = dictGetIterator(zs->dict);
9298 dictEntry *de;
9299
9300 while((de = dictNext(di)) != NULL) {
9301 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9302 robj *eleobj = dictGetEntryKey(de);
9303 double *score = dictGetEntryVal(de);
9304
9305 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9306 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9d65a1bb 9307 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 9308 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9309 }
9310 dictReleaseIterator(di);
9c8e3cee 9311 } else if (o->type == REDIS_HASH) {
9312 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9313
9314 /* Emit the HSETs needed to rebuild the hash */
9315 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9316 unsigned char *p = zipmapRewind(o->ptr);
9317 unsigned char *field, *val;
9318 unsigned int flen, vlen;
9319
9320 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9321 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9322 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9323 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9324 return -1;
9325 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9326 return -1;
9327 }
9328 } else {
9329 dictIterator *di = dictGetIterator(o->ptr);
9330 dictEntry *de;
9331
9332 while((de = dictNext(di)) != NULL) {
9333 robj *field = dictGetEntryKey(de);
9334 robj *val = dictGetEntryVal(de);
9335
9336 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9337 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9338 if (fwriteBulkObject(fp,field) == -1) return -1;
9339 if (fwriteBulkObject(fp,val) == -1) return -1;
9340 }
9341 dictReleaseIterator(di);
9342 }
9d65a1bb 9343 } else {
f83c6cb5 9344 redisPanic("Unknown object type");
9d65a1bb 9345 }
9346 /* Save the expire time */
9347 if (expiretime != -1) {
e96e4fbf 9348 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 9349 /* If this key is already expired skip it */
9350 if (expiretime < now) continue;
9351 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9352 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9eaef89f 9353 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9d65a1bb 9354 }
b9bc0eef 9355 if (swapped) decrRefCount(o);
9d65a1bb 9356 }
9357 dictReleaseIterator(di);
9358 }
9359
9360 /* Make sure data will not remain on the OS's output buffers */
9361 fflush(fp);
b0bd87f6 9362 aof_fsync(fileno(fp));
9d65a1bb 9363 fclose(fp);
e0a62c7f 9364
9d65a1bb 9365 /* Use RENAME to make sure the DB file is changed atomically only
9366 * if the generate DB file is ok. */
9367 if (rename(tmpfile,filename) == -1) {
9368 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9369 unlink(tmpfile);
9370 return REDIS_ERR;
9371 }
9372 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9373 return REDIS_OK;
9374
9375werr:
9376 fclose(fp);
9377 unlink(tmpfile);
e96e4fbf 9378 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 9379 if (di) dictReleaseIterator(di);
9380 return REDIS_ERR;
9381}
9382
9383/* This is how rewriting of the append only file in background works:
9384 *
9385 * 1) The user calls BGREWRITEAOF
9386 * 2) Redis calls this function, that forks():
9387 * 2a) the child rewrite the append only file in a temp file.
9388 * 2b) the parent accumulates differences in server.bgrewritebuf.
9389 * 3) When the child finished '2a' exists.
9390 * 4) The parent will trap the exit code, if it's OK, will append the
9391 * data accumulated into server.bgrewritebuf into the temp file, and
9392 * finally will rename(2) the temp file in the actual file name.
9393 * The the new file is reopened as the new append only file. Profit!
9394 */
9395static int rewriteAppendOnlyFileBackground(void) {
9396 pid_t childpid;
9397
9398 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 9399 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 9400 if ((childpid = fork()) == 0) {
9401 /* Child */
9402 char tmpfile[256];
9d65a1bb 9403
054e426d 9404 if (server.vm_enabled) vmReopenSwapFile();
9405 close(server.fd);
9d65a1bb 9406 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9407 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 9408 _exit(0);
9d65a1bb 9409 } else {
478c2c6f 9410 _exit(1);
9d65a1bb 9411 }
9412 } else {
9413 /* Parent */
9414 if (childpid == -1) {
9415 redisLog(REDIS_WARNING,
9416 "Can't rewrite append only file in background: fork: %s",
9417 strerror(errno));
9418 return REDIS_ERR;
9419 }
9420 redisLog(REDIS_NOTICE,
9421 "Background append only file rewriting started by pid %d",childpid);
9422 server.bgrewritechildpid = childpid;
884d4b39 9423 updateDictResizePolicy();
85a83172 9424 /* We set appendseldb to -1 in order to force the next call to the
9425 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9426 * accumulated by the parent into server.bgrewritebuf will start
9427 * with a SELECT statement and it will be safe to merge. */
9428 server.appendseldb = -1;
9d65a1bb 9429 return REDIS_OK;
9430 }
9431 return REDIS_OK; /* unreached */
9432}
9433
9434static void bgrewriteaofCommand(redisClient *c) {
9435 if (server.bgrewritechildpid != -1) {
9436 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9437 return;
9438 }
9439 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 9440 char *status = "+Background append only file rewriting started\r\n";
9441 addReplySds(c,sdsnew(status));
9d65a1bb 9442 } else {
9443 addReply(c,shared.err);
9444 }
9445}
9446
/* Remove the temp file named "temp-rewriteaof-bg-<childpid>.aof", that is
 * the name used by the background rewriting child with that pid. The
 * result of unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9453
996cb5f7 9454/* Virtual Memory is composed mainly of two subsystems:
9455 * - Blocking Virtual Memory
9456 * - Threaded Virtual Memory I/O
9457 * The two parts are not fully decoupled, but functions are split among two
9458 * different sections of the source code (delimited by comments) in order to
9459 * make more clear what functionality is about the blocking VM and what about
9460 * the threaded (not blocking) VM.
9461 *
9462 * Redis VM design:
9463 *
9464 * Redis VM is a blocking VM (one that blocks reading swapped values from
9465 * disk into memory when a value swapped out is needed in memory) that is made
9466 * unblocking by trying to examine the command argument vector in order to
9467 * load in background values that will likely be needed in order to exec
9468 * the command. The command is executed only once all the relevant keys
9469 * are loaded into memory.
9470 *
9471 * This basically is almost as simple of a blocking VM, but almost as parallel
9472 * as a fully non-blocking VM.
9473 */
9474
560db612 9475/* =================== Virtual Memory - Blocking Side ====================== */
2e5eb04e 9476
560db612 9477/* Create a VM pointer object. This kind of objects are used in place of
9478 * values in the key -> value hash table, for swapped out objects. */
9479static vmpointer *createVmPointer(int vtype) {
9480 vmpointer *vp = zmalloc(sizeof(vmpointer));
2e5eb04e 9481
560db612 9482 vp->type = REDIS_VMPOINTER;
9483 vp->storage = REDIS_VM_SWAPPED;
9484 vp->vtype = vtype;
9485 return vp;
2e5eb04e 9486}
9487
75680a3c 9488static void vmInit(void) {
9489 off_t totsize;
996cb5f7 9490 int pipefds[2];
bcaa7a4f 9491 size_t stacksize;
8b5bb414 9492 struct flock fl;
75680a3c 9493
4ad37480 9494 if (server.vm_max_threads != 0)
9495 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9496
054e426d 9497 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 9498 /* Try to open the old swap file, otherwise create it */
6fa987e3 9499 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9500 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9501 }
75680a3c 9502 if (server.vm_fp == NULL) {
6fa987e3 9503 redisLog(REDIS_WARNING,
8b5bb414 9504 "Can't open the swap file: %s. Exiting.",
6fa987e3 9505 strerror(errno));
75680a3c 9506 exit(1);
9507 }
9508 server.vm_fd = fileno(server.vm_fp);
8b5bb414 9509 /* Lock the swap file for writing, this is useful in order to avoid
9510 * another instance to use the same swap file for a config error. */
9511 fl.l_type = F_WRLCK;
9512 fl.l_whence = SEEK_SET;
9513 fl.l_start = fl.l_len = 0;
9514 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9515 redisLog(REDIS_WARNING,
9516 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9517 exit(1);
9518 }
9519 /* Initialize */
75680a3c 9520 server.vm_next_page = 0;
9521 server.vm_near_pages = 0;
7d98e08c 9522 server.vm_stats_used_pages = 0;
9523 server.vm_stats_swapped_objects = 0;
9524 server.vm_stats_swapouts = 0;
9525 server.vm_stats_swapins = 0;
75680a3c 9526 totsize = server.vm_pages*server.vm_page_size;
9527 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9528 if (ftruncate(server.vm_fd,totsize) == -1) {
9529 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9530 strerror(errno));
9531 exit(1);
9532 } else {
9533 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9534 }
7d30035d 9535 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 9536 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 9537 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 9538 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 9539
996cb5f7 9540 /* Initialize threaded I/O (used by Virtual Memory) */
9541 server.io_newjobs = listCreate();
9542 server.io_processing = listCreate();
9543 server.io_processed = listCreate();
d5d55fc3 9544 server.io_ready_clients = listCreate();
92f8e882 9545 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 9546 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9547 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 9548 server.io_active_threads = 0;
996cb5f7 9549 if (pipe(pipefds) == -1) {
9550 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9551 ,strerror(errno));
9552 exit(1);
9553 }
9554 server.io_ready_pipe_read = pipefds[0];
9555 server.io_ready_pipe_write = pipefds[1];
9556 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 9557 /* LZF requires a lot of stack */
9558 pthread_attr_init(&server.io_threads_attr);
9559 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9560 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9561 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 9562 /* Listen for events in the threaded I/O pipe */
9563 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9564 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9565 oom("creating file event");
75680a3c 9566}
9567
06224fec 9568/* Mark the page as used */
9569static void vmMarkPageUsed(off_t page) {
9570 off_t byte = page/8;
9571 int bit = page&7;
970e10bb 9572 redisAssert(vmFreePage(page) == 1);
06224fec 9573 server.vm_bitmap[byte] |= 1<<bit;
9574}
9575
9576/* Mark N contiguous pages as used, with 'page' being the first. */
9577static void vmMarkPagesUsed(off_t page, off_t count) {
9578 off_t j;
9579
9580 for (j = 0; j < count; j++)
7d30035d 9581 vmMarkPageUsed(page+j);
7d98e08c 9582 server.vm_stats_used_pages += count;
7c775e09 9583 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9584 (long long)count, (long long)page);
06224fec 9585}
9586
9587/* Mark the page as free */
9588static void vmMarkPageFree(off_t page) {
9589 off_t byte = page/8;
9590 int bit = page&7;
970e10bb 9591 redisAssert(vmFreePage(page) == 0);
06224fec 9592 server.vm_bitmap[byte] &= ~(1<<bit);
9593}
9594
9595/* Mark N contiguous pages as free, with 'page' being the first. */
9596static void vmMarkPagesFree(off_t page, off_t count) {
9597 off_t j;
9598
9599 for (j = 0; j < count; j++)
7d30035d 9600 vmMarkPageFree(page+j);
7d98e08c 9601 server.vm_stats_used_pages -= count;
7c775e09 9602 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9603 (long long)count, (long long)page);
06224fec 9604}
9605
9606/* Test if the page is free */
9607static int vmFreePage(off_t page) {
9608 off_t byte = page/8;
9609 int bit = page&7;
7d30035d 9610 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9611}
9612
9613/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9614 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9615 * REDIS_ERR is returned.
06224fec 9616 *
9617 * This function uses a simple algorithm: we try to allocate
9618 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9619 * again from the start of the swap file searching for free spaces.
9620 *
9621 * If it looks pretty clear that there are no free pages near our offset
9622 * we try to find less populated places doing a forward jump of
9623 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9624 * without hurry, and then we jump again and so forth...
e0a62c7f 9625 *
06224fec 9626 * This function can be improved using a free list to avoid to guess
9627 * too much, since we could collect data about freed pages.
9628 *
9629 * note: I implemented this function just after watching an episode of
9630 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9631 */
c7df85a4 9632static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9633 off_t base, offset = 0, since_jump = 0, numfree = 0;
9634
9635 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9636 server.vm_near_pages = 0;
9637 server.vm_next_page = 0;
9638 }
9639 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9640 base = server.vm_next_page;
9641
9642 while(offset < server.vm_pages) {
9643 off_t this = base+offset;
9644
9645 /* If we overflow, restart from page zero */
9646 if (this >= server.vm_pages) {
9647 this -= server.vm_pages;
9648 if (this == 0) {
9649 /* Just overflowed, what we found on tail is no longer
9650 * interesting, as it's no longer contiguous. */
9651 numfree = 0;
9652 }
9653 }
9654 if (vmFreePage(this)) {
9655 /* This is a free page */
9656 numfree++;
9657 /* Already got N free pages? Return to the caller, with success */
9658 if (numfree == n) {
7d30035d 9659 *first = this-(n-1);
9660 server.vm_next_page = this+1;
7c775e09 9661 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9662 return REDIS_OK;
06224fec 9663 }
9664 } else {
9665 /* The current one is not a free page */
9666 numfree = 0;
9667 }
9668
9669 /* Fast-forward if the current page is not free and we already
9670 * searched enough near this place. */
9671 since_jump++;
9672 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9673 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9674 since_jump = 0;
9675 /* Note that even if we rewind after the jump, we are don't need
9676 * to make sure numfree is set to zero as we only jump *if* it
9677 * is set to zero. */
9678 } else {
9679 /* Otherwise just check the next page */
9680 offset++;
9681 }
9682 }
3a66edc7 9683 return REDIS_ERR;
9684}
9685
a5819310 9686/* Write the specified object at the specified page of the swap file */
9687static int vmWriteObjectOnSwap(robj *o, off_t page) {
9688 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9689 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9690 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9691 redisLog(REDIS_WARNING,
9ebed7cf 9692 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9693 strerror(errno));
9694 return REDIS_ERR;
9695 }
9696 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9697 fflush(server.vm_fp);
a5819310 9698 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9699 return REDIS_OK;
9700}
9701
a4798f73 9702/* Transfers the 'val' object to disk. Store all the information
9703 * a 'vmpointer' object containing all the information needed to load the
9704 * object back later is returned.
9705 *
3a66edc7 9706 * If we can't find enough contiguous empty pages to swap the object on disk
a4798f73 9707 * NULL is returned. */
560db612 9708static vmpointer *vmSwapObjectBlocking(robj *val) {
b9bc0eef 9709 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9710 off_t page;
560db612 9711 vmpointer *vp;
3a66edc7 9712
560db612 9713 assert(val->storage == REDIS_VM_MEMORY);
9714 assert(val->refcount == 1);
9715 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9716 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9717
9718 vp = createVmPointer(val->type);
9719 vp->page = page;
9720 vp->usedpages = pages;
3a66edc7 9721 decrRefCount(val); /* Deallocate the object from memory. */
9722 vmMarkPagesUsed(page,pages);
560db612 9723 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9724 (void*) val,
7d30035d 9725 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9726 server.vm_stats_swapped_objects++;
9727 server.vm_stats_swapouts++;
560db612 9728 return vp;
3a66edc7 9729}
9730
a5819310 9731static robj *vmReadObjectFromSwap(off_t page, int type) {
9732 robj *o;
3a66edc7 9733
a5819310 9734 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9735 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9736 redisLog(REDIS_WARNING,
d5d55fc3 9737 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9738 strerror(errno));
478c2c6f 9739 _exit(1);
3a66edc7 9740 }
a5819310 9741 o = rdbLoadObject(type,server.vm_fp);
9742 if (o == NULL) {
d5d55fc3 9743 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9744 _exit(1);
3a66edc7 9745 }
a5819310 9746 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9747 return o;
9748}
9749
560db612 9750/* Load the specified object from swap to memory.
a5819310 9751 * The newly allocated object is returned.
9752 *
9753 * If preview is true the unserialized object is returned to the caller but
560db612 9754 * the pages are not marked as freed, nor the vp object is freed. */
9755static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
a5819310 9756 robj *val;
9757
560db612 9758 redisAssert(vp->type == REDIS_VMPOINTER &&
9759 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9760 val = vmReadObjectFromSwap(vp->page,vp->vtype);
7e69548d 9761 if (!preview) {
560db612 9762 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9763 vmMarkPagesFree(vp->page,vp->usedpages);
9764 zfree(vp);
7d98e08c 9765 server.vm_stats_swapped_objects--;
38aba9a1 9766 } else {
560db612 9767 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
7e69548d 9768 }
7d98e08c 9769 server.vm_stats_swapins++;
3a66edc7 9770 return val;
06224fec 9771}
9772
560db612 9773/* Plain object loading, from swap to memory.
9774 *
9775 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9776 * The return value is the loaded object. */
9777static robj *vmLoadObject(robj *o) {
996cb5f7 9778 /* If we are loading the object in background, stop it, we
9779 * need to load this object synchronously ASAP. */
560db612 9780 if (o->storage == REDIS_VM_LOADING)
9781 vmCancelThreadedIOJob(o);
9782 return vmGenericLoadObject((vmpointer*)o,0);
7e69548d 9783}
9784
9785/* Just load the value on disk, without to modify the key.
9786 * This is useful when we want to perform some operation on the value
9787 * without to really bring it from swap to memory, like while saving the
9788 * dataset or rewriting the append only log. */
560db612 9789static robj *vmPreviewObject(robj *o) {
9790 return vmGenericLoadObject((vmpointer*)o,1);
7e69548d 9791}
9792
4ef8de8a 9793/* How a good candidate is this object for swapping?
9794 * The better candidate it is, the greater the returned value.
9795 *
9796 * Currently we try to perform a fast estimation of the object size in
9797 * memory, and combine it with aging informations.
9798 *
9799 * Basically swappability = idle-time * log(estimated size)
9800 *
9801 * Bigger objects are preferred over smaller objects, but not
9802 * proportionally, this is why we use the logarithm. This algorithm is
9803 * just a first try and will probably be tuned later. */
9804static double computeObjectSwappability(robj *o) {
560db612 9805 /* actual age can be >= minage, but not < minage. As we use wrapping
9806 * 21 bit clocks with minutes resolution for the LRU. */
9807 time_t minage = abs(server.lruclock - o->lru);
4ef8de8a 9808 long asize = 0;
9809 list *l;
9810 dict *d;
9811 struct dictEntry *de;
9812 int z;
9813
560db612 9814 if (minage <= 0) return 0;
4ef8de8a 9815 switch(o->type) {
9816 case REDIS_STRING:
9817 if (o->encoding != REDIS_ENCODING_RAW) {
9818 asize = sizeof(*o);
9819 } else {
9820 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9821 }
9822 break;
9823 case REDIS_LIST:
9824 l = o->ptr;
9825 listNode *ln = listFirst(l);
9826
9827 asize = sizeof(list);
9828 if (ln) {
9829 robj *ele = ln->value;
9830 long elesize;
9831
9832 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9833 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9834 asize += (sizeof(listNode)+elesize)*listLength(l);
9835 }
9836 break;
9837 case REDIS_SET:
9838 case REDIS_ZSET:
9839 z = (o->type == REDIS_ZSET);
9840 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9841
9842 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9843 if (z) asize += sizeof(zset)-sizeof(dict);
9844 if (dictSize(d)) {
9845 long elesize;
9846 robj *ele;
9847
9848 de = dictGetRandomKey(d);
9849 ele = dictGetEntryKey(de);
9850 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9851 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9852 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9853 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9854 }
9855 break;
a97b9060 9856 case REDIS_HASH:
9857 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9858 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9859 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9860 unsigned int klen, vlen;
9861 unsigned char *key, *val;
9862
9863 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9864 klen = 0;
9865 vlen = 0;
9866 }
9867 asize = len*(klen+vlen+3);
9868 } else if (o->encoding == REDIS_ENCODING_HT) {
9869 d = o->ptr;
9870 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9871 if (dictSize(d)) {
9872 long elesize;
9873 robj *ele;
9874
9875 de = dictGetRandomKey(d);
9876 ele = dictGetEntryKey(de);
9877 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9878 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9879 ele = dictGetEntryVal(de);
9880 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9881 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9882 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9883 }
9884 }
9885 break;
4ef8de8a 9886 }
560db612 9887 return (double)minage*log(1+asize);
4ef8de8a 9888}
9889
9890/* Try to swap an object that's a good candidate for swapping.
9891 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9892 * to swap any object at all.
9893 *
9894 * If 'usethreaded' is true, Redis will try to swap the object in background
9895 * using I/O threads. */
9896static int vmSwapOneObject(int usethreads) {
4ef8de8a 9897 int j, i;
9898 struct dictEntry *best = NULL;
9899 double best_swappability = 0;
b9bc0eef 9900 redisDb *best_db = NULL;
44262c58 9901 robj *val;
9902 sds key;
4ef8de8a 9903
9904 for (j = 0; j < server.dbnum; j++) {
9905 redisDb *db = server.db+j;
b72f6a4b 9906 /* Why maxtries is set to 100?
9907 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9908 * are swappable objects */
b0d8747d 9909 int maxtries = 100;
4ef8de8a 9910
9911 if (dictSize(db->dict) == 0) continue;
9912 for (i = 0; i < 5; i++) {
9913 dictEntry *de;
9914 double swappability;
9915
e3cadb8a 9916 if (maxtries) maxtries--;
4ef8de8a 9917 de = dictGetRandomKey(db->dict);
4ef8de8a 9918 val = dictGetEntryVal(de);
1064ef87 9919 /* Only swap objects that are currently in memory.
9920 *
560db612 9921 * Also don't swap shared objects: not a good idea in general and
9922 * we need to ensure that the main thread does not touch the
1064ef87 9923 * object while the I/O thread is using it, but we can't
9924 * control other keys without adding additional mutex. */
560db612 9925 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
e3cadb8a 9926 if (maxtries) i--; /* don't count this try */
9927 continue;
9928 }
4ef8de8a 9929 swappability = computeObjectSwappability(val);
9930 if (!best || swappability > best_swappability) {
9931 best = de;
9932 best_swappability = swappability;
b9bc0eef 9933 best_db = db;
4ef8de8a 9934 }
9935 }
9936 }
7c775e09 9937 if (best == NULL) return REDIS_ERR;
4ef8de8a 9938 key = dictGetEntryKey(best);
9939 val = dictGetEntryVal(best);
9940
e3cadb8a 9941 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
44262c58 9942 key, best_swappability);
4ef8de8a 9943
4ef8de8a 9944 /* Swap it */
a69a0c9c 9945 if (usethreads) {
4c8f2370 9946 robj *keyobj = createStringObject(key,sdslen(key));
9947 vmSwapObjectThreaded(keyobj,val,best_db);
9948 decrRefCount(keyobj);
4ef8de8a 9949 return REDIS_OK;
9950 } else {
560db612 9951 vmpointer *vp;
9952
9953 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9954 dictGetEntryVal(best) = vp;
a69a0c9c 9955 return REDIS_OK;
9956 } else {
9957 return REDIS_ERR;
9958 }
4ef8de8a 9959 }
9960}
9961
/* Swap out one object synchronously. Returns what vmSwapOneObject()
 * returns: REDIS_OK if an object was swapped, REDIS_ERR otherwise. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9965
/* Queue one object for swapping using the I/O threads. Returns what
 * vmSwapOneObject() returns: REDIS_OK if a swap job was queued, REDIS_ERR
 * if no swappable object was found. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9969
7e69548d 9970/* Return true if it's safe to swap out objects in a given moment.
9971 * Basically we don't want to swap objects out while there is a BGSAVE
9972 * or a BGAEOREWRITE running in backgroud. */
9973static int vmCanSwapOut(void) {
9974 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9975}
9976
996cb5f7 9977/* =================== Virtual Memory - Threaded I/O ======================= */
9978
b9bc0eef 9979static void freeIOJob(iojob *j) {
d5d55fc3 9980 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9981 j->type == REDIS_IOJOB_DO_SWAP ||
9982 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
560db612 9983 {
e4ed181d 9984 /* we fix the storage type, otherwise decrRefCount() will try to
9985 * kill the I/O thread Job (that does no longer exists). */
9986 if (j->val->storage == REDIS_VM_SWAPPING)
560db612 9987 j->val->storage = REDIS_VM_MEMORY;
b9bc0eef 9988 decrRefCount(j->val);
560db612 9989 }
9990 decrRefCount(j->key);
b9bc0eef 9991 zfree(j);
9992}
9993
996cb5f7 9994/* Every time a thread finished a Job, it writes a byte into the write side
9995 * of an unix pipe in order to "awake" the main thread, and this function
9996 * is called. */
9997static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9998 int mask)
9999{
10000 char buf[1];
b0d8747d 10001 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 10002 REDIS_NOTUSED(el);
10003 REDIS_NOTUSED(mask);
10004 REDIS_NOTUSED(privdata);
10005
10006 /* For every byte we read in the read side of the pipe, there is one
10007 * I/O job completed to process. */
10008 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 10009 iojob *j;
10010 listNode *ln;
b9bc0eef 10011 struct dictEntry *de;
10012
996cb5f7 10013 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 10014
10015 /* Get the processed element (the oldest one) */
10016 lockThreadedIO();
1064ef87 10017 assert(listLength(server.io_processed) != 0);
f6c0bba8 10018 if (toprocess == -1) {
10019 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
10020 if (toprocess <= 0) toprocess = 1;
10021 }
b9bc0eef 10022 ln = listFirst(server.io_processed);
10023 j = ln->value;
10024 listDelNode(server.io_processed,ln);
10025 unlockThreadedIO();
10026 /* If this job is marked as canceled, just ignore it */
10027 if (j->canceled) {
10028 freeIOJob(j);
10029 continue;
10030 }
10031 /* Post process it in the main thread, as there are things we
10032 * can do just here to avoid race conditions and/or invasive locks */
560db612 10033 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
44262c58 10034 de = dictFind(j->db->dict,j->key->ptr);
e4ed181d 10035 redisAssert(de != NULL);
b9bc0eef 10036 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 10037 redisDb *db;
560db612 10038 vmpointer *vp = dictGetEntryVal(de);
d5d55fc3 10039
b9bc0eef 10040 /* Key loaded, bring it at home */
560db612 10041 vmMarkPagesFree(vp->page,vp->usedpages);
b9bc0eef 10042 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
560db612 10043 (unsigned char*) j->key->ptr);
b9bc0eef 10044 server.vm_stats_swapped_objects--;
10045 server.vm_stats_swapins++;
d5d55fc3 10046 dictGetEntryVal(de) = j->val;
10047 incrRefCount(j->val);
10048 db = j->db;
d5d55fc3 10049 /* Handle clients waiting for this key to be loaded. */
560db612 10050 handleClientsBlockedOnSwappedKey(db,j->key);
10051 freeIOJob(j);
10052 zfree(vp);
b9bc0eef 10053 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10054 /* Now we know the amount of pages required to swap this object.
10055 * Let's find some space for it, and queue this task again
10056 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 10057 if (!vmCanSwapOut() ||
10058 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
10059 {
10060 /* Ooops... no space or we can't swap as there is
10061 * a fork()ed Redis trying to save stuff on disk. */
560db612 10062 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 10063 freeIOJob(j);
10064 } else {
c7df85a4 10065 /* Note that we need to mark this pages as used now,
10066 * if the job will be canceled, we'll mark them as freed
10067 * again. */
10068 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 10069 j->type = REDIS_IOJOB_DO_SWAP;
10070 lockThreadedIO();
10071 queueIOJob(j);
10072 unlockThreadedIO();
10073 }
10074 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
560db612 10075 vmpointer *vp;
b9bc0eef 10076
10077 /* Key swapped. We can finally free some memory. */
560db612 10078 if (j->val->storage != REDIS_VM_SWAPPING) {
10079 vmpointer *vp = (vmpointer*) j->id;
10080 printf("storage: %d\n",vp->storage);
10081 printf("key->name: %s\n",(char*)j->key->ptr);
6c96ba7d 10082 printf("val: %p\n",(void*)j->val);
10083 printf("val->type: %d\n",j->val->type);
10084 printf("val->ptr: %s\n",(char*)j->val->ptr);
10085 }
560db612 10086 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
10087 vp = createVmPointer(j->val->type);
10088 vp->page = j->page;
10089 vp->usedpages = j->pages;
10090 dictGetEntryVal(de) = vp;
e4ed181d 10091 /* Fix the storage otherwise decrRefCount will attempt to
10092 * remove the associated I/O job */
10093 j->val->storage = REDIS_VM_MEMORY;
560db612 10094 decrRefCount(j->val);
b9bc0eef 10095 redisLog(REDIS_DEBUG,
10096 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
560db612 10097 (unsigned char*) j->key->ptr,
b9bc0eef 10098 (unsigned long long) j->page, (unsigned long long) j->pages);
10099 server.vm_stats_swapped_objects++;
10100 server.vm_stats_swapouts++;
10101 freeIOJob(j);
f11b8647 10102 /* Put a few more swap requests in queue if we are still
10103 * out of memory */
b0d8747d 10104 if (trytoswap && vmCanSwapOut() &&
10105 zmalloc_used_memory() > server.vm_max_memory)
10106 {
f11b8647 10107 int more = 1;
10108 while(more) {
10109 lockThreadedIO();
10110 more = listLength(server.io_newjobs) <
10111 (unsigned) server.vm_max_threads;
10112 unlockThreadedIO();
10113 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 10114 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
10115 trytoswap = 0;
10116 break;
10117 }
f11b8647 10118 }
10119 }
b9bc0eef 10120 }
c953f24b 10121 processed++;
f6c0bba8 10122 if (processed == toprocess) return;
996cb5f7 10123 }
10124 if (retval < 0 && errno != EAGAIN) {
10125 redisLog(REDIS_WARNING,
10126 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10127 strerror(errno));
10128 }
10129}
10130
10131static void lockThreadedIO(void) {
10132 pthread_mutex_lock(&server.io_mutex);
10133}
10134
10135static void unlockThreadedIO(void) {
10136 pthread_mutex_unlock(&server.io_mutex);
10137}
10138
10139/* Remove the specified object from the threaded I/O queue if still not
10140 * processed, otherwise make sure to flag it as canceled. */
10141static void vmCancelThreadedIOJob(robj *o) {
10142 list *lists[3] = {
6c96ba7d 10143 server.io_newjobs, /* 0 */
10144 server.io_processing, /* 1 */
10145 server.io_processed /* 2 */
996cb5f7 10146 };
10147 int i;
10148
10149 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 10150again:
996cb5f7 10151 lockThreadedIO();
560db612 10152 /* Search for a matching object in one of the queues */
996cb5f7 10153 for (i = 0; i < 3; i++) {
10154 listNode *ln;
c7df85a4 10155 listIter li;
996cb5f7 10156
c7df85a4 10157 listRewind(lists[i],&li);
10158 while ((ln = listNext(&li)) != NULL) {
996cb5f7 10159 iojob *job = ln->value;
10160
6c96ba7d 10161 if (job->canceled) continue; /* Skip this, already canceled. */
560db612 10162 if (job->id == o) {
dbc289ae 10163 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10164 (void*)job, (char*)job->key->ptr, job->type, i);
427a2153 10165 /* Mark the pages as free since the swap didn't happened
10166 * or happened but is now discarded. */
970e10bb 10167 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 10168 vmMarkPagesFree(job->page,job->pages);
10169 /* Cancel the job. It depends on the list the job is
10170 * living in. */
996cb5f7 10171 switch(i) {
10172 case 0: /* io_newjobs */
6c96ba7d 10173 /* If the job was yet not processed the best thing to do
996cb5f7 10174 * is to remove it from the queue at all */
6c96ba7d 10175 freeIOJob(job);
996cb5f7 10176 listDelNode(lists[i],ln);
10177 break;
10178 case 1: /* io_processing */
d5d55fc3 10179 /* Oh Shi- the thread is messing with the Job:
10180 *
10181 * Probably it's accessing the object if this is a
10182 * PREPARE_SWAP or DO_SWAP job.
10183 * If it's a LOAD job it may be reading from disk and
10184 * if we don't wait for the job to terminate before to
10185 * cancel it, maybe in a few microseconds data can be
10186 * corrupted in this pages. So the short story is:
10187 *
10188 * Better to wait for the job to move into the
10189 * next queue (processed)... */
10190
10191 /* We try again and again until the job is completed. */
10192 unlockThreadedIO();
10193 /* But let's wait some time for the I/O thread
10194 * to finish with this job. After all this condition
10195 * should be very rare. */
10196 usleep(1);
10197 goto again;
996cb5f7 10198 case 2: /* io_processed */
2e111efe 10199 /* The job was already processed, that's easy...
10200 * just mark it as canceled so that we'll ignore it
10201 * when processing completed jobs. */
996cb5f7 10202 job->canceled = 1;
10203 break;
10204 }
c7df85a4 10205 /* Finally we have to adjust the storage type of the object
10206 * in order to "UNDO" the operaiton. */
996cb5f7 10207 if (o->storage == REDIS_VM_LOADING)
10208 o->storage = REDIS_VM_SWAPPED;
10209 else if (o->storage == REDIS_VM_SWAPPING)
10210 o->storage = REDIS_VM_MEMORY;
10211 unlockThreadedIO();
e4ed181d 10212 redisLog(REDIS_DEBUG,"*** DONE");
996cb5f7 10213 return;
10214 }
10215 }
10216 }
10217 unlockThreadedIO();
560db612 10218 printf("Not found: %p\n", (void*)o);
10219 redisAssert(1 != 1); /* We should never reach this */
996cb5f7 10220}
10221
b9bc0eef 10222static void *IOThreadEntryPoint(void *arg) {
10223 iojob *j;
10224 listNode *ln;
10225 REDIS_NOTUSED(arg);
10226
10227 pthread_detach(pthread_self());
10228 while(1) {
10229 /* Get a new job to process */
10230 lockThreadedIO();
10231 if (listLength(server.io_newjobs) == 0) {
10232 /* No new jobs in queue, exit. */
9ebed7cf 10233 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10234 (long) pthread_self());
b9bc0eef 10235 server.io_active_threads--;
10236 unlockThreadedIO();
10237 return NULL;
10238 }
10239 ln = listFirst(server.io_newjobs);
10240 j = ln->value;
10241 listDelNode(server.io_newjobs,ln);
10242 /* Add the job in the processing queue */
10243 j->thread = pthread_self();
10244 listAddNodeTail(server.io_processing,j);
10245 ln = listLast(server.io_processing); /* We use ln later to remove it */
10246 unlockThreadedIO();
9ebed7cf 10247 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10248 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 10249
10250 /* Process the Job */
10251 if (j->type == REDIS_IOJOB_LOAD) {
560db612 10252 vmpointer *vp = (vmpointer*)j->id;
10253 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
b9bc0eef 10254 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10255 FILE *fp = fopen("/dev/null","w+");
10256 j->pages = rdbSavedObjectPages(j->val,fp);
10257 fclose(fp);
10258 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 10259 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10260 j->canceled = 1;
b9bc0eef 10261 }
10262
10263 /* Done: insert the job into the processed queue */
9ebed7cf 10264 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10265 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 10266 lockThreadedIO();
10267 listDelNode(server.io_processing,ln);
10268 listAddNodeTail(server.io_processed,j);
10269 unlockThreadedIO();
e0a62c7f 10270
b9bc0eef 10271 /* Signal the main thread there is new stuff to process */
10272 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10273 }
10274 return NULL; /* never reached */
10275}
10276
10277static void spawnIOThread(void) {
10278 pthread_t thread;
478c2c6f 10279 sigset_t mask, omask;
a97b9060 10280 int err;
b9bc0eef 10281
478c2c6f 10282 sigemptyset(&mask);
10283 sigaddset(&mask,SIGCHLD);
10284 sigaddset(&mask,SIGHUP);
10285 sigaddset(&mask,SIGPIPE);
10286 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 10287 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10288 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10289 strerror(err));
10290 usleep(1000000);
10291 }
478c2c6f 10292 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 10293 server.io_active_threads++;
10294}
10295
4ee9488d 10296/* We need to wait for the last thread to exit before we are able to
10297 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 10298static void waitEmptyIOJobsQueue(void) {
4ee9488d 10299 while(1) {
76b7233a 10300 int io_processed_len;
10301
4ee9488d 10302 lockThreadedIO();
054e426d 10303 if (listLength(server.io_newjobs) == 0 &&
10304 listLength(server.io_processing) == 0 &&
10305 server.io_active_threads == 0)
10306 {
4ee9488d 10307 unlockThreadedIO();
10308 return;
10309 }
76b7233a 10310 /* While waiting for empty jobs queue condition we post-process some
10311 * finshed job, as I/O threads may be hanging trying to write against
10312 * the io_ready_pipe_write FD but there are so much pending jobs that
10313 * it's blocking. */
10314 io_processed_len = listLength(server.io_processed);
4ee9488d 10315 unlockThreadedIO();
76b7233a 10316 if (io_processed_len) {
10317 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10318 usleep(1000); /* 1 millisecond */
10319 } else {
10320 usleep(10000); /* 10 milliseconds */
10321 }
4ee9488d 10322 }
10323}
10324
054e426d 10325static void vmReopenSwapFile(void) {
478c2c6f 10326 /* Note: we don't close the old one as we are in the child process
10327 * and don't want to mess at all with the original file object. */
054e426d 10328 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10329 if (server.vm_fp == NULL) {
10330 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10331 server.vm_swap_file);
478c2c6f 10332 _exit(1);
054e426d 10333 }
10334 server.vm_fd = fileno(server.vm_fp);
10335}
10336
b9bc0eef 10337/* This function must be called while with threaded IO locked */
10338static void queueIOJob(iojob *j) {
6c96ba7d 10339 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10340 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 10341 listAddNodeTail(server.io_newjobs,j);
10342 if (server.io_active_threads < server.vm_max_threads)
10343 spawnIOThread();
10344}
10345
10346static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10347 iojob *j;
e0a62c7f 10348
b9bc0eef 10349 j = zmalloc(sizeof(*j));
10350 j->type = REDIS_IOJOB_PREPARE_SWAP;
10351 j->db = db;
78ebe4c8 10352 j->key = key;
7dd8e7cf 10353 incrRefCount(key);
560db612 10354 j->id = j->val = val;
b9bc0eef 10355 incrRefCount(val);
10356 j->canceled = 0;
10357 j->thread = (pthread_t) -1;
560db612 10358 val->storage = REDIS_VM_SWAPPING;
b9bc0eef 10359
10360 lockThreadedIO();
10361 queueIOJob(j);
10362 unlockThreadedIO();
10363 return REDIS_OK;
10364}
10365
b0d8747d 10366/* ============ Virtual Memory - Blocking clients on missing keys =========== */
10367
d5d55fc3 10368/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10369 * If there is not already a job loading the key, it is craeted.
10370 * The key is added to the io_keys list in the client structure, and also
10371 * in the hash table mapping swapped keys to waiting clients, that is,
10372 * server.io_waited_keys. */
10373static int waitForSwappedKey(redisClient *c, robj *key) {
10374 struct dictEntry *de;
10375 robj *o;
10376 list *l;
10377
10378 /* If the key does not exist or is already in RAM we don't need to
10379 * block the client at all. */
09241813 10380 de = dictFind(c->db->dict,key->ptr);
d5d55fc3 10381 if (de == NULL) return 0;
560db612 10382 o = dictGetEntryVal(de);
d5d55fc3 10383 if (o->storage == REDIS_VM_MEMORY) {
10384 return 0;
10385 } else if (o->storage == REDIS_VM_SWAPPING) {
10386 /* We were swapping the key, undo it! */
10387 vmCancelThreadedIOJob(o);
10388 return 0;
10389 }
e0a62c7f 10390
d5d55fc3 10391 /* OK: the key is either swapped, or being loaded just now. */
10392
10393 /* Add the key to the list of keys this client is waiting for.
10394 * This maps clients to keys they are waiting for. */
10395 listAddNodeTail(c->io_keys,key);
10396 incrRefCount(key);
10397
10398 /* Add the client to the swapped keys => clients waiting map. */
10399 de = dictFind(c->db->io_keys,key);
10400 if (de == NULL) {
10401 int retval;
10402
10403 /* For every key we take a list of clients blocked for it */
10404 l = listCreate();
10405 retval = dictAdd(c->db->io_keys,key,l);
10406 incrRefCount(key);
10407 assert(retval == DICT_OK);
10408 } else {
10409 l = dictGetEntryVal(de);
10410 }
10411 listAddNodeTail(l,c);
10412
10413 /* Are we already loading the key from disk? If not create a job */
10414 if (o->storage == REDIS_VM_SWAPPED) {
10415 iojob *j;
560db612 10416 vmpointer *vp = (vmpointer*)o;
d5d55fc3 10417
10418 o->storage = REDIS_VM_LOADING;
10419 j = zmalloc(sizeof(*j));
10420 j->type = REDIS_IOJOB_LOAD;
10421 j->db = c->db;
560db612 10422 j->id = (robj*)vp;
10423 j->key = key;
10424 incrRefCount(key);
10425 j->page = vp->page;
d5d55fc3 10426 j->val = NULL;
10427 j->canceled = 0;
10428 j->thread = (pthread_t) -1;
10429 lockThreadedIO();
10430 queueIOJob(j);
10431 unlockThreadedIO();
10432 }
10433 return 1;
10434}
10435
6f078746
PN
10436/* Preload keys for any command with first, last and step values for
10437 * the command keys prototype, as defined in the command table. */
10438static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10439 int j, last;
10440 if (cmd->vm_firstkey == 0) return;
10441 last = cmd->vm_lastkey;
10442 if (last < 0) last = argc+last;
10443 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10444 redisAssert(j < argc);
10445 waitForSwappedKey(c,argv[j]);
10446 }
10447}
10448
5d373da9 10449/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
10450 * Note that the number of keys to preload is user-defined, so we need to
10451 * apply a sanity check against argc. */
ca1788b5 10452static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 10453 int i, num;
ca1788b5 10454 REDIS_NOTUSED(cmd);
ca1788b5
PN
10455
10456 num = atoi(argv[2]->ptr);
739ba0d2 10457 if (num > (argc-3)) return;
76583ea4 10458 for (i = 0; i < num; i++) {
ca1788b5 10459 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
10460 }
10461}
10462
3805e04f
PN
10463/* Preload keys needed to execute the entire MULTI/EXEC block.
10464 *
10465 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10466 * and will block the client when any command requires a swapped out value. */
10467static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10468 int i, margc;
10469 struct redisCommand *mcmd;
10470 robj **margv;
10471 REDIS_NOTUSED(cmd);
10472 REDIS_NOTUSED(argc);
10473 REDIS_NOTUSED(argv);
10474
10475 if (!(c->flags & REDIS_MULTI)) return;
10476 for (i = 0; i < c->mstate.count; i++) {
10477 mcmd = c->mstate.commands[i].cmd;
10478 margc = c->mstate.commands[i].argc;
10479 margv = c->mstate.commands[i].argv;
10480
10481 if (mcmd->vm_preload_proc != NULL) {
10482 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10483 } else {
10484 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10485 }
76583ea4
PN
10486 }
10487}
10488
b0d8747d 10489/* Is this client attempting to run a command against swapped keys?
d5d55fc3 10490 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 10491 *
d5d55fc3 10492 * The important idea about this function is that it can fail! If keys will
10493 * still be swapped when the client is resumed, this key lookups will
10494 * just block loading keys from disk. In practical terms this should only
10495 * happen with SORT BY command or if there is a bug in this function.
10496 *
10497 * Return 1 if the client is marked as blocked, 0 if the client can
10498 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 10499static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 10500 if (cmd->vm_preload_proc != NULL) {
ca1788b5 10501 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 10502 } else {
6f078746 10503 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
10504 }
10505
d5d55fc3 10506 /* If the client was blocked for at least one key, mark it as blocked. */
10507 if (listLength(c->io_keys)) {
10508 c->flags |= REDIS_IO_WAIT;
10509 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10510 server.vm_blocked_clients++;
10511 return 1;
10512 } else {
10513 return 0;
10514 }
10515}
10516
10517/* Remove the 'key' from the list of blocked keys for a given client.
10518 *
10519 * The function returns 1 when there are no longer blocking keys after
10520 * the current one was removed (and the client can be unblocked). */
10521static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10522 list *l;
10523 listNode *ln;
10524 listIter li;
10525 struct dictEntry *de;
10526
10527 /* Remove the key from the list of keys this client is waiting for. */
10528 listRewind(c->io_keys,&li);
10529 while ((ln = listNext(&li)) != NULL) {
bf028098 10530 if (equalStringObjects(ln->value,key)) {
d5d55fc3 10531 listDelNode(c->io_keys,ln);
10532 break;
10533 }
10534 }
10535 assert(ln != NULL);
10536
10537 /* Remove the client form the key => waiting clients map. */
10538 de = dictFind(c->db->io_keys,key);
10539 assert(de != NULL);
10540 l = dictGetEntryVal(de);
10541 ln = listSearchKey(l,c);
10542 assert(ln != NULL);
10543 listDelNode(l,ln);
10544 if (listLength(l) == 0)
10545 dictDelete(c->db->io_keys,key);
10546
10547 return listLength(c->io_keys) == 0;
10548}
10549
560db612 10550/* Every time we now a key was loaded back in memory, we handle clients
10551 * waiting for this key if any. */
d5d55fc3 10552static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10553 struct dictEntry *de;
10554 list *l;
10555 listNode *ln;
10556 int len;
10557
10558 de = dictFind(db->io_keys,key);
10559 if (!de) return;
10560
10561 l = dictGetEntryVal(de);
10562 len = listLength(l);
10563 /* Note: we can't use something like while(listLength(l)) as the list
10564 * can be freed by the calling function when we remove the last element. */
10565 while (len--) {
10566 ln = listFirst(l);
10567 redisClient *c = ln->value;
10568
10569 if (dontWaitForSwappedKey(c,key)) {
10570 /* Put the client in the list of clients ready to go as we
10571 * loaded all the keys about it. */
10572 listAddNodeTail(server.io_ready_clients,c);
10573 }
10574 }
b0d8747d 10575}
b0d8747d 10576
500ece7c 10577/* =========================== Remote Configuration ========================= */
10578
10579static void configSetCommand(redisClient *c) {
10580 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10581 long long ll;
10582
500ece7c 10583 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10584 zfree(server.dbfilename);
10585 server.dbfilename = zstrdup(o->ptr);
10586 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10587 zfree(server.requirepass);
10588 server.requirepass = zstrdup(o->ptr);
10589 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10590 zfree(server.masterauth);
10591 server.masterauth = zstrdup(o->ptr);
10592 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10593 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10594 ll < 0) goto badfmt;
10595 server.maxmemory = ll;
10596 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10597 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10598 ll < 0 || ll > LONG_MAX) goto badfmt;
10599 server.maxidletime = ll;
1b677732 10600 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10601 if (!strcasecmp(o->ptr,"no")) {
10602 server.appendfsync = APPENDFSYNC_NO;
10603 } else if (!strcasecmp(o->ptr,"everysec")) {
10604 server.appendfsync = APPENDFSYNC_EVERYSEC;
10605 } else if (!strcasecmp(o->ptr,"always")) {
10606 server.appendfsync = APPENDFSYNC_ALWAYS;
10607 } else {
10608 goto badfmt;
10609 }
38db9171 10610 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10611 int yn = yesnotoi(o->ptr);
10612
10613 if (yn == -1) goto badfmt;
10614 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10615 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10616 int old = server.appendonly;
10617 int new = yesnotoi(o->ptr);
10618
10619 if (new == -1) goto badfmt;
10620 if (old != new) {
10621 if (new == 0) {
10622 stopAppendOnly();
10623 } else {
10624 if (startAppendOnly() == REDIS_ERR) {
10625 addReplySds(c,sdscatprintf(sdsempty(),
10626 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10627 decrRefCount(o);
10628 return;
10629 }
10630 }
10631 }
a34e0a25 10632 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10633 int vlen, j;
10634 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10635
10636 /* Perform sanity check before setting the new config:
10637 * - Even number of args
10638 * - Seconds >= 1, changes >= 0 */
10639 if (vlen & 1) {
10640 sdsfreesplitres(v,vlen);
10641 goto badfmt;
10642 }
10643 for (j = 0; j < vlen; j++) {
10644 char *eptr;
10645 long val;
10646
10647 val = strtoll(v[j], &eptr, 10);
10648 if (eptr[0] != '\0' ||
10649 ((j & 1) == 0 && val < 1) ||
10650 ((j & 1) == 1 && val < 0)) {
10651 sdsfreesplitres(v,vlen);
10652 goto badfmt;
10653 }
10654 }
10655 /* Finally set the new config */
10656 resetServerSaveParams();
10657 for (j = 0; j < vlen; j += 2) {
10658 time_t seconds;
10659 int changes;
10660
10661 seconds = strtoll(v[j],NULL,10);
10662 changes = strtoll(v[j+1],NULL,10);
10663 appendServerSaveParams(seconds, changes);
10664 }
10665 sdsfreesplitres(v,vlen);
500ece7c 10666 } else {
10667 addReplySds(c,sdscatprintf(sdsempty(),
10668 "-ERR not supported CONFIG parameter %s\r\n",
10669 (char*)c->argv[2]->ptr));
10670 decrRefCount(o);
10671 return;
10672 }
10673 decrRefCount(o);
10674 addReply(c,shared.ok);
a34e0a25 10675 return;
10676
10677badfmt: /* Bad format errors */
10678 addReplySds(c,sdscatprintf(sdsempty(),
10679 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10680 (char*)o->ptr,
10681 (char*)c->argv[2]->ptr));
10682 decrRefCount(o);
500ece7c 10683}
10684
10685static void configGetCommand(redisClient *c) {
10686 robj *o = getDecodedObject(c->argv[2]);
10687 robj *lenobj = createObject(REDIS_STRING,NULL);
10688 char *pattern = o->ptr;
10689 int matches = 0;
10690
10691 addReply(c,lenobj);
10692 decrRefCount(lenobj);
10693
10694 if (stringmatch(pattern,"dbfilename",0)) {
10695 addReplyBulkCString(c,"dbfilename");
10696 addReplyBulkCString(c,server.dbfilename);
10697 matches++;
10698 }
10699 if (stringmatch(pattern,"requirepass",0)) {
10700 addReplyBulkCString(c,"requirepass");
10701 addReplyBulkCString(c,server.requirepass);
10702 matches++;
10703 }
10704 if (stringmatch(pattern,"masterauth",0)) {
10705 addReplyBulkCString(c,"masterauth");
10706 addReplyBulkCString(c,server.masterauth);
10707 matches++;
10708 }
10709 if (stringmatch(pattern,"maxmemory",0)) {
10710 char buf[128];
10711
2e5eb04e 10712 ll2string(buf,128,server.maxmemory);
500ece7c 10713 addReplyBulkCString(c,"maxmemory");
10714 addReplyBulkCString(c,buf);
10715 matches++;
10716 }
2e5eb04e 10717 if (stringmatch(pattern,"timeout",0)) {
10718 char buf[128];
10719
10720 ll2string(buf,128,server.maxidletime);
10721 addReplyBulkCString(c,"timeout");
10722 addReplyBulkCString(c,buf);
10723 matches++;
10724 }
10725 if (stringmatch(pattern,"appendonly",0)) {
10726 addReplyBulkCString(c,"appendonly");
10727 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10728 matches++;
10729 }
38db9171 10730 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10731 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10732 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10733 matches++;
10734 }
1b677732 10735 if (stringmatch(pattern,"appendfsync",0)) {
10736 char *policy;
10737
10738 switch(server.appendfsync) {
10739 case APPENDFSYNC_NO: policy = "no"; break;
10740 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10741 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10742 default: policy = "unknown"; break; /* too harmless to panic */
10743 }
10744 addReplyBulkCString(c,"appendfsync");
10745 addReplyBulkCString(c,policy);
10746 matches++;
10747 }
a34e0a25 10748 if (stringmatch(pattern,"save",0)) {
10749 sds buf = sdsempty();
10750 int j;
10751
10752 for (j = 0; j < server.saveparamslen; j++) {
10753 buf = sdscatprintf(buf,"%ld %d",
10754 server.saveparams[j].seconds,
10755 server.saveparams[j].changes);
10756 if (j != server.saveparamslen-1)
10757 buf = sdscatlen(buf," ",1);
10758 }
10759 addReplyBulkCString(c,"save");
10760 addReplyBulkCString(c,buf);
10761 sdsfree(buf);
10762 matches++;
10763 }
500ece7c 10764 decrRefCount(o);
10765 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10766}
10767
10768static void configCommand(redisClient *c) {
10769 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10770 if (c->argc != 4) goto badarity;
10771 configSetCommand(c);
10772 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10773 if (c->argc != 3) goto badarity;
10774 configGetCommand(c);
10775 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10776 if (c->argc != 2) goto badarity;
10777 server.stat_numcommands = 0;
10778 server.stat_numconnections = 0;
10779 server.stat_expiredkeys = 0;
10780 server.stat_starttime = time(NULL);
10781 addReply(c,shared.ok);
10782 } else {
10783 addReplySds(c,sdscatprintf(sdsempty(),
10784 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10785 }
10786 return;
10787
10788badarity:
10789 addReplySds(c,sdscatprintf(sdsempty(),
10790 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10791 (char*) c->argv[1]->ptr));
10792}
10793
befec3cd 10794/* =========================== Pubsub implementation ======================== */
10795
ffc6b7f8 10796static void freePubsubPattern(void *p) {
10797 pubsubPattern *pat = p;
10798
10799 decrRefCount(pat->pattern);
10800 zfree(pat);
10801}
10802
10803static int listMatchPubsubPattern(void *a, void *b) {
10804 pubsubPattern *pa = a, *pb = b;
10805
10806 return (pa->client == pb->client) &&
bf028098 10807 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10808}
10809
10810/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10811 * 0 if the client was already subscribed to that channel. */
10812static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10813 struct dictEntry *de;
10814 list *clients = NULL;
10815 int retval = 0;
10816
ffc6b7f8 10817 /* Add the channel to the client -> channels hash table */
10818 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10819 retval = 1;
ffc6b7f8 10820 incrRefCount(channel);
10821 /* Add the client to the channel -> list of clients hash table */
10822 de = dictFind(server.pubsub_channels,channel);
befec3cd 10823 if (de == NULL) {
10824 clients = listCreate();
ffc6b7f8 10825 dictAdd(server.pubsub_channels,channel,clients);
10826 incrRefCount(channel);
befec3cd 10827 } else {
10828 clients = dictGetEntryVal(de);
10829 }
10830 listAddNodeTail(clients,c);
10831 }
10832 /* Notify the client */
10833 addReply(c,shared.mbulk3);
10834 addReply(c,shared.subscribebulk);
ffc6b7f8 10835 addReplyBulk(c,channel);
482b672d 10836 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10837 return retval;
10838}
10839
ffc6b7f8 10840/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10841 * 0 if the client was not subscribed to the specified channel. */
10842static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10843 struct dictEntry *de;
10844 list *clients;
10845 listNode *ln;
10846 int retval = 0;
10847
ffc6b7f8 10848 /* Remove the channel from the client -> channels hash table */
10849 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10850 we have in the hash tables. Protect it... */
ffc6b7f8 10851 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10852 retval = 1;
ffc6b7f8 10853 /* Remove the client from the channel -> clients list hash table */
10854 de = dictFind(server.pubsub_channels,channel);
befec3cd 10855 assert(de != NULL);
10856 clients = dictGetEntryVal(de);
10857 ln = listSearchKey(clients,c);
10858 assert(ln != NULL);
10859 listDelNode(clients,ln);
ff767a75 10860 if (listLength(clients) == 0) {
10861 /* Free the list and associated hash entry at all if this was
10862 * the latest client, so that it will be possible to abuse
ffc6b7f8 10863 * Redis PUBSUB creating millions of channels. */
10864 dictDelete(server.pubsub_channels,channel);
ff767a75 10865 }
befec3cd 10866 }
10867 /* Notify the client */
10868 if (notify) {
10869 addReply(c,shared.mbulk3);
10870 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10871 addReplyBulk(c,channel);
482b672d 10872 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10873 listLength(c->pubsub_patterns));
10874
10875 }
10876 decrRefCount(channel); /* it is finally safe to release it */
10877 return retval;
10878}
10879
10880/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10881static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10882 int retval = 0;
10883
10884 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10885 retval = 1;
10886 pubsubPattern *pat;
10887 listAddNodeTail(c->pubsub_patterns,pattern);
10888 incrRefCount(pattern);
10889 pat = zmalloc(sizeof(*pat));
10890 pat->pattern = getDecodedObject(pattern);
10891 pat->client = c;
10892 listAddNodeTail(server.pubsub_patterns,pat);
10893 }
10894 /* Notify the client */
10895 addReply(c,shared.mbulk3);
10896 addReply(c,shared.psubscribebulk);
10897 addReplyBulk(c,pattern);
482b672d 10898 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10899 return retval;
10900}
10901
10902/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10903 * 0 if the client was not subscribed to the specified channel. */
10904static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10905 listNode *ln;
10906 pubsubPattern pat;
10907 int retval = 0;
10908
10909 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10910 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10911 retval = 1;
10912 listDelNode(c->pubsub_patterns,ln);
10913 pat.client = c;
10914 pat.pattern = pattern;
10915 ln = listSearchKey(server.pubsub_patterns,&pat);
10916 listDelNode(server.pubsub_patterns,ln);
10917 }
10918 /* Notify the client */
10919 if (notify) {
10920 addReply(c,shared.mbulk3);
10921 addReply(c,shared.punsubscribebulk);
10922 addReplyBulk(c,pattern);
482b672d 10923 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10924 listLength(c->pubsub_patterns));
befec3cd 10925 }
ffc6b7f8 10926 decrRefCount(pattern);
befec3cd 10927 return retval;
10928}
10929
ffc6b7f8 10930/* Unsubscribe from all the channels. Return the number of channels the
10931 * client was subscribed from. */
10932static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10933 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10934 dictEntry *de;
10935 int count = 0;
10936
10937 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10938 robj *channel = dictGetEntryKey(de);
befec3cd 10939
ffc6b7f8 10940 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10941 }
10942 dictReleaseIterator(di);
10943 return count;
10944}
10945
ffc6b7f8 10946/* Unsubscribe from all the patterns. Return the number of patterns the
10947 * client was subscribed from. */
10948static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10949 listNode *ln;
10950 listIter li;
10951 int count = 0;
10952
10953 listRewind(c->pubsub_patterns,&li);
10954 while ((ln = listNext(&li)) != NULL) {
10955 robj *pattern = ln->value;
10956
10957 count += pubsubUnsubscribePattern(c,pattern,notify);
10958 }
10959 return count;
10960}
10961
befec3cd 10962/* Publish a message */
ffc6b7f8 10963static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10964 int receivers = 0;
10965 struct dictEntry *de;
ffc6b7f8 10966 listNode *ln;
10967 listIter li;
befec3cd 10968
ffc6b7f8 10969 /* Send to clients listening for that channel */
10970 de = dictFind(server.pubsub_channels,channel);
befec3cd 10971 if (de) {
10972 list *list = dictGetEntryVal(de);
10973 listNode *ln;
10974 listIter li;
10975
10976 listRewind(list,&li);
10977 while ((ln = listNext(&li)) != NULL) {
10978 redisClient *c = ln->value;
10979
10980 addReply(c,shared.mbulk3);
10981 addReply(c,shared.messagebulk);
ffc6b7f8 10982 addReplyBulk(c,channel);
befec3cd 10983 addReplyBulk(c,message);
10984 receivers++;
10985 }
10986 }
ffc6b7f8 10987 /* Send to clients listening to matching channels */
10988 if (listLength(server.pubsub_patterns)) {
10989 listRewind(server.pubsub_patterns,&li);
10990 channel = getDecodedObject(channel);
10991 while ((ln = listNext(&li)) != NULL) {
10992 pubsubPattern *pat = ln->value;
10993
10994 if (stringmatchlen((char*)pat->pattern->ptr,
10995 sdslen(pat->pattern->ptr),
10996 (char*)channel->ptr,
10997 sdslen(channel->ptr),0)) {
c8d0ea0e 10998 addReply(pat->client,shared.mbulk4);
10999 addReply(pat->client,shared.pmessagebulk);
11000 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 11001 addReplyBulk(pat->client,channel);
11002 addReplyBulk(pat->client,message);
11003 receivers++;
11004 }
11005 }
11006 decrRefCount(channel);
11007 }
befec3cd 11008 return receivers;
11009}
11010
11011static void subscribeCommand(redisClient *c) {
11012 int j;
11013
11014 for (j = 1; j < c->argc; j++)
ffc6b7f8 11015 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 11016}
11017
11018static void unsubscribeCommand(redisClient *c) {
11019 if (c->argc == 1) {
ffc6b7f8 11020 pubsubUnsubscribeAllChannels(c,1);
11021 return;
11022 } else {
11023 int j;
11024
11025 for (j = 1; j < c->argc; j++)
11026 pubsubUnsubscribeChannel(c,c->argv[j],1);
11027 }
11028}
11029
11030static void psubscribeCommand(redisClient *c) {
11031 int j;
11032
11033 for (j = 1; j < c->argc; j++)
11034 pubsubSubscribePattern(c,c->argv[j]);
11035}
11036
11037static void punsubscribeCommand(redisClient *c) {
11038 if (c->argc == 1) {
11039 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 11040 return;
11041 } else {
11042 int j;
11043
11044 for (j = 1; j < c->argc; j++)
ffc6b7f8 11045 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 11046 }
11047}
11048
11049static void publishCommand(redisClient *c) {
11050 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 11051 addReplyLongLong(c,receivers);
befec3cd 11052}
11053
37ab76c9 11054/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
11055 *
11056 * The implementation uses a per-DB hash table mapping keys to list of clients
11057 * WATCHing those keys, so that given a key that is going to be modified
11058 * we can mark all the associated clients as dirty.
11059 *
11060 * Also every client contains a list of WATCHed keys so that's possible to
11061 * un-watch such keys when the client is freed or when UNWATCH is called. */
11062
11063/* In the client->watched_keys list we need to use watchedKey structures
11064 * as in order to identify a key in Redis we need both the key name and the
11065 * DB */
11066typedef struct watchedKey {
11067 robj *key;
11068 redisDb *db;
11069} watchedKey;
11070
11071/* Watch for the specified key */
11072static void watchForKey(redisClient *c, robj *key) {
11073 list *clients = NULL;
11074 listIter li;
11075 listNode *ln;
11076 watchedKey *wk;
11077
11078 /* Check if we are already watching for this key */
11079 listRewind(c->watched_keys,&li);
11080 while((ln = listNext(&li))) {
11081 wk = listNodeValue(ln);
11082 if (wk->db == c->db && equalStringObjects(key,wk->key))
11083 return; /* Key already watched */
11084 }
11085 /* This key is not already watched in this DB. Let's add it */
11086 clients = dictFetchValue(c->db->watched_keys,key);
11087 if (!clients) {
11088 clients = listCreate();
11089 dictAdd(c->db->watched_keys,key,clients);
11090 incrRefCount(key);
11091 }
11092 listAddNodeTail(clients,c);
11093 /* Add the new key to the lits of keys watched by this client */
11094 wk = zmalloc(sizeof(*wk));
11095 wk->key = key;
11096 wk->db = c->db;
11097 incrRefCount(key);
11098 listAddNodeTail(c->watched_keys,wk);
11099}
11100
11101/* Unwatch all the keys watched by this client. To clean the EXEC dirty
11102 * flag is up to the caller. */
11103static void unwatchAllKeys(redisClient *c) {
11104 listIter li;
11105 listNode *ln;
11106
11107 if (listLength(c->watched_keys) == 0) return;
11108 listRewind(c->watched_keys,&li);
11109 while((ln = listNext(&li))) {
11110 list *clients;
11111 watchedKey *wk;
11112
11113 /* Lookup the watched key -> clients list and remove the client
11114 * from the list */
11115 wk = listNodeValue(ln);
11116 clients = dictFetchValue(wk->db->watched_keys, wk->key);
11117 assert(clients != NULL);
11118 listDelNode(clients,listSearchKey(clients,c));
11119 /* Kill the entry at all if this was the only client */
11120 if (listLength(clients) == 0)
11121 dictDelete(wk->db->watched_keys, wk->key);
11122 /* Remove this watched key from the client->watched list */
11123 listDelNode(c->watched_keys,ln);
11124 decrRefCount(wk->key);
11125 zfree(wk);
11126 }
11127}
11128
ca3f830b 11129/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 11130 * next EXEC will fail. */
11131static void touchWatchedKey(redisDb *db, robj *key) {
11132 list *clients;
11133 listIter li;
11134 listNode *ln;
11135
11136 if (dictSize(db->watched_keys) == 0) return;
11137 clients = dictFetchValue(db->watched_keys, key);
11138 if (!clients) return;
11139
11140 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11141 /* Check if we are already watching for this key */
11142 listRewind(clients,&li);
11143 while((ln = listNext(&li))) {
11144 redisClient *c = listNodeValue(ln);
11145
11146 c->flags |= REDIS_DIRTY_CAS;
11147 }
11148}
11149
9b30e1a2 11150/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11151 * flush but will be deleted as effect of the flushing operation should
11152 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11153 * a FLUSHALL operation (all the DBs flushed). */
11154static void touchWatchedKeysOnFlush(int dbid) {
11155 listIter li1, li2;
11156 listNode *ln;
11157
11158 /* For every client, check all the waited keys */
11159 listRewind(server.clients,&li1);
11160 while((ln = listNext(&li1))) {
11161 redisClient *c = listNodeValue(ln);
11162 listRewind(c->watched_keys,&li2);
11163 while((ln = listNext(&li2))) {
11164 watchedKey *wk = listNodeValue(ln);
11165
11166 /* For every watched key matching the specified DB, if the
11167 * key exists, mark the client as dirty, as the key will be
11168 * removed. */
11169 if (dbid == -1 || wk->db->id == dbid) {
09241813 11170 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
9b30e1a2 11171 c->flags |= REDIS_DIRTY_CAS;
11172 }
11173 }
11174 }
11175}
11176
37ab76c9 11177static void watchCommand(redisClient *c) {
11178 int j;
11179
6531c94d 11180 if (c->flags & REDIS_MULTI) {
11181 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11182 return;
11183 }
37ab76c9 11184 for (j = 1; j < c->argc; j++)
11185 watchForKey(c,c->argv[j]);
11186 addReply(c,shared.ok);
11187}
11188
11189static void unwatchCommand(redisClient *c) {
11190 unwatchAllKeys(c);
11191 c->flags &= (~REDIS_DIRTY_CAS);
11192 addReply(c,shared.ok);
11193}
11194
7f957c92 11195/* ================================= Debugging ============================== */
11196
ba798261 11197/* Compute the sha1 of string at 's' with 'len' bytes long.
11198 * The SHA1 is then xored againt the string pointed by digest.
11199 * Since xor is commutative, this operation is used in order to
11200 * "add" digests relative to unordered elements.
11201 *
11202 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11203static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11204 SHA1_CTX ctx;
11205 unsigned char hash[20], *s = ptr;
11206 int j;
11207
11208 SHA1Init(&ctx);
11209 SHA1Update(&ctx,s,len);
11210 SHA1Final(hash,&ctx);
11211
11212 for (j = 0; j < 20; j++)
11213 digest[j] ^= hash[j];
11214}
11215
11216static void xorObjectDigest(unsigned char *digest, robj *o) {
11217 o = getDecodedObject(o);
11218 xorDigest(digest,o->ptr,sdslen(o->ptr));
11219 decrRefCount(o);
11220}
11221
11222/* This function instead of just computing the SHA1 and xoring it
11223 * against diget, also perform the digest of "digest" itself and
11224 * replace the old value with the new one.
11225 *
11226 * So the final digest will be:
11227 *
11228 * digest = SHA1(digest xor SHA1(data))
11229 *
11230 * This function is used every time we want to preserve the order so
11231 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11232 *
11233 * Also note that mixdigest("foo") followed by mixdigest("bar")
11234 * will lead to a different digest compared to "fo", "obar".
11235 */
11236static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11237 SHA1_CTX ctx;
11238 char *s = ptr;
11239
11240 xorDigest(digest,s,len);
11241 SHA1Init(&ctx);
11242 SHA1Update(&ctx,digest,20);
11243 SHA1Final(digest,&ctx);
11244}
11245
11246static void mixObjectDigest(unsigned char *digest, robj *o) {
11247 o = getDecodedObject(o);
11248 mixDigest(digest,o->ptr,sdslen(o->ptr));
11249 decrRefCount(o);
11250}
11251
11252/* Compute the dataset digest. Since keys, sets elements, hashes elements
11253 * are not ordered, we use a trick: every aggregate digest is the xor
11254 * of the digests of their elements. This way the order will not change
11255 * the result. For list instead we use a feedback entering the output digest
11256 * as input in order to ensure that a different ordered list will result in
11257 * a different digest. */
11258static void computeDatasetDigest(unsigned char *final) {
11259 unsigned char digest[20];
11260 char buf[128];
11261 dictIterator *di = NULL;
11262 dictEntry *de;
11263 int j;
11264 uint32_t aux;
11265
11266 memset(final,0,20); /* Start with a clean result */
11267
11268 for (j = 0; j < server.dbnum; j++) {
11269 redisDb *db = server.db+j;
11270
11271 if (dictSize(db->dict) == 0) continue;
11272 di = dictGetIterator(db->dict);
11273
11274 /* hash the DB id, so the same dataset moved in a different
11275 * DB will lead to a different digest */
11276 aux = htonl(j);
11277 mixDigest(final,&aux,sizeof(aux));
11278
11279 /* Iterate this DB writing every entry */
11280 while((de = dictNext(di)) != NULL) {
09241813 11281 sds key;
11282 robj *keyobj, *o;
ba798261 11283 time_t expiretime;
11284
11285 memset(digest,0,20); /* This key-val digest */
11286 key = dictGetEntryKey(de);
09241813 11287 keyobj = createStringObject(key,sdslen(key));
11288
11289 mixDigest(digest,key,sdslen(key));
11290
11291 /* Make sure the key is loaded if VM is active */
11292 o = lookupKeyRead(db,keyobj);
cbae1d34 11293
ba798261 11294 aux = htonl(o->type);
11295 mixDigest(digest,&aux,sizeof(aux));
09241813 11296 expiretime = getExpire(db,keyobj);
ba798261 11297
11298 /* Save the key and associated value */
11299 if (o->type == REDIS_STRING) {
11300 mixObjectDigest(digest,o);
11301 } else if (o->type == REDIS_LIST) {
003f0840
PN
11302 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11303 listTypeEntry entry;
11304 while(listTypeNext(li,&entry)) {
11305 robj *eleobj = listTypeGet(&entry);
ba798261 11306 mixObjectDigest(digest,eleobj);
dc845730 11307 decrRefCount(eleobj);
ba798261 11308 }
003f0840 11309 listTypeReleaseIterator(li);
ba798261 11310 } else if (o->type == REDIS_SET) {
11311 dict *set = o->ptr;
11312 dictIterator *di = dictGetIterator(set);
11313 dictEntry *de;
11314
11315 while((de = dictNext(di)) != NULL) {
11316 robj *eleobj = dictGetEntryKey(de);
11317
11318 xorObjectDigest(digest,eleobj);
11319 }
11320 dictReleaseIterator(di);
11321 } else if (o->type == REDIS_ZSET) {
11322 zset *zs = o->ptr;
11323 dictIterator *di = dictGetIterator(zs->dict);
11324 dictEntry *de;
11325
11326 while((de = dictNext(di)) != NULL) {
11327 robj *eleobj = dictGetEntryKey(de);
11328 double *score = dictGetEntryVal(de);
11329 unsigned char eledigest[20];
11330
11331 snprintf(buf,sizeof(buf),"%.17g",*score);
11332 memset(eledigest,0,20);
11333 mixObjectDigest(eledigest,eleobj);
11334 mixDigest(eledigest,buf,strlen(buf));
11335 xorDigest(digest,eledigest,20);
11336 }
11337 dictReleaseIterator(di);
11338 } else if (o->type == REDIS_HASH) {
d1578a33 11339 hashTypeIterator *hi;
ba798261 11340 robj *obj;
11341
d1578a33
PN
11342 hi = hashTypeInitIterator(o);
11343 while (hashTypeNext(hi) != REDIS_ERR) {
ba798261 11344 unsigned char eledigest[20];
11345
11346 memset(eledigest,0,20);
d1578a33 11347 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
ba798261 11348 mixObjectDigest(eledigest,obj);
11349 decrRefCount(obj);
d1578a33 11350 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
ba798261 11351 mixObjectDigest(eledigest,obj);
11352 decrRefCount(obj);
11353 xorDigest(digest,eledigest,20);
11354 }
d1578a33 11355 hashTypeReleaseIterator(hi);
ba798261 11356 } else {
11357 redisPanic("Unknown object type");
11358 }
ba798261 11359 /* If the key has an expire, add it to the mix */
11360 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11361 /* We can finally xor the key-val digest to the final digest */
11362 xorDigest(final,digest,20);
09241813 11363 decrRefCount(keyobj);
ba798261 11364 }
11365 dictReleaseIterator(di);
11366 }
11367}
11368
7f957c92 11369static void debugCommand(redisClient *c) {
11370 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11371 *((char*)-1) = 'x';
210e29f7 11372 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11373 if (rdbSave(server.dbfilename) != REDIS_OK) {
11374 addReply(c,shared.err);
11375 return;
11376 }
11377 emptyDb();
11378 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11379 addReply(c,shared.err);
11380 return;
11381 }
11382 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11383 addReply(c,shared.ok);
71c2b467 11384 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11385 emptyDb();
11386 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11387 addReply(c,shared.err);
11388 return;
11389 }
11390 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11391 addReply(c,shared.ok);
333298da 11392 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
09241813 11393 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11394 robj *val;
333298da 11395
11396 if (!de) {
11397 addReply(c,shared.nokeyerr);
11398 return;
11399 }
333298da 11400 val = dictGetEntryVal(de);
560db612 11401 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11402 val->storage == REDIS_VM_SWAPPING)) {
07efaf74 11403 char *strenc;
11404 char buf[128];
11405
11406 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11407 strenc = strencoding[val->encoding];
11408 } else {
11409 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11410 strenc = buf;
11411 }
ace06542 11412 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11413 "+Value at:%p refcount:%d "
07efaf74 11414 "encoding:%s serializedlength:%lld\r\n",
09241813 11415 (void*)val, val->refcount,
07efaf74 11416 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 11417 } else {
560db612 11418 vmpointer *vp = (vmpointer*) val;
ace06542 11419 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11420 "+Value swapped at: page %llu "
ace06542 11421 "using %llu pages\r\n",
09241813 11422 (unsigned long long) vp->page,
560db612 11423 (unsigned long long) vp->usedpages));
ace06542 11424 }
78ebe4c8 11425 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11426 lookupKeyRead(c->db,c->argv[2]);
11427 addReply(c,shared.ok);
7d30035d 11428 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
09241813 11429 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11430 robj *val;
560db612 11431 vmpointer *vp;
7d30035d 11432
11433 if (!server.vm_enabled) {
11434 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11435 return;
11436 }
11437 if (!de) {
11438 addReply(c,shared.nokeyerr);
11439 return;
11440 }
7d30035d 11441 val = dictGetEntryVal(de);
4ef8de8a 11442 /* Swap it */
560db612 11443 if (val->storage != REDIS_VM_MEMORY) {
7d30035d 11444 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
560db612 11445 } else if (val->refcount != 1) {
11446 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11447 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11448 dictGetEntryVal(de) = vp;
7d30035d 11449 addReply(c,shared.ok);
11450 } else {
11451 addReply(c,shared.err);
11452 }
59305dc7 11453 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11454 long keys, j;
11455 robj *key, *val;
11456 char buf[128];
11457
11458 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11459 return;
11460 for (j = 0; j < keys; j++) {
11461 snprintf(buf,sizeof(buf),"key:%lu",j);
11462 key = createStringObject(buf,strlen(buf));
11463 if (lookupKeyRead(c->db,key) != NULL) {
11464 decrRefCount(key);
11465 continue;
11466 }
11467 snprintf(buf,sizeof(buf),"value:%lu",j);
11468 val = createStringObject(buf,strlen(buf));
09241813 11469 dbAdd(c->db,key,val);
11470 decrRefCount(key);
59305dc7 11471 }
11472 addReply(c,shared.ok);
ba798261 11473 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11474 unsigned char digest[20];
11475 sds d = sdsnew("+");
11476 int j;
11477
11478 computeDatasetDigest(digest);
11479 for (j = 0; j < 20; j++)
11480 d = sdscatprintf(d, "%02x",digest[j]);
11481
11482 d = sdscatlen(d,"\r\n",2);
11483 addReplySds(c,d);
7f957c92 11484 } else {
333298da 11485 addReplySds(c,sdsnew(
bdcb92f2 11486 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 11487 }
11488}
56906eef 11489
6c96ba7d 11490static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 11491 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 11492 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 11493#ifdef HAVE_BACKTRACE
11494 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11495 *((char*)-1) = 'x';
11496#endif
11497}
11498
c651fd9e 11499static void _redisPanic(char *msg, char *file, int line) {
11500 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 11501 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 11502#ifdef HAVE_BACKTRACE
11503 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11504 *((char*)-1) = 'x';
11505#endif
11506}
11507
bcfc686d 11508/* =================================== Main! ================================ */
56906eef 11509
bcfc686d 11510#ifdef __linux__
/* Return the integer obtained by passing the first line of
 * /proc/sys/vm/overcommit_memory to atoi(), or -1 if the file cannot be
 * opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *proc = fopen("/proc/sys/vm/overcommit_memory","r");

    if (proc == NULL) return value;
    if (fgets(line,64,proc) != NULL) value = atoi(line);
    fclose(proc);
    return value;
}
11524
11525void linuxOvercommitMemoryWarning(void) {
11526 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 11527 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 11528 }
11529}
11530#endif /* __linux__ */
11531
11532static void daemonize(void) {
11533 int fd;
11534 FILE *fp;
11535
11536 if (fork() != 0) exit(0); /* parent exits */
11537 setsid(); /* create a new session */
11538
11539 /* Every output goes to /dev/null. If Redis is daemonized but
11540 * the 'logfile' is set to 'stdout' in the configuration file
11541 * it will not log at all. */
11542 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11543 dup2(fd, STDIN_FILENO);
11544 dup2(fd, STDOUT_FILENO);
11545 dup2(fd, STDERR_FILENO);
11546 if (fd > STDERR_FILENO) close(fd);
11547 }
11548 /* Try to write the pid file */
11549 fp = fopen(server.pidfile,"w");
11550 if (fp) {
11551 fprintf(fp,"%d\n",getpid());
11552 fclose(fp);
56906eef 11553 }
56906eef 11554}
11555
42ab0172 11556static void version() {
8a3b0d2d 11557 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11558 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11559 exit(0);
11560}
11561
723fb69b
AO
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11567
bcfc686d 11568int main(int argc, char **argv) {
9651a787 11569 time_t start;
11570
bcfc686d 11571 initServerConfig();
1a132bbc 11572 sortCommandTable();
bcfc686d 11573 if (argc == 2) {
44efe66e 11574 if (strcmp(argv[1], "-v") == 0 ||
11575 strcmp(argv[1], "--version") == 0) version();
11576 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11577 resetServerSaveParams();
11578 loadServerConfig(argv[1]);
723fb69b
AO
11579 } else if ((argc > 2)) {
11580 usage();
bcfc686d 11581 } else {
11582 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11583 }
bcfc686d 11584 if (server.daemonize) daemonize();
71c54b21 11585 initServer();
bcfc686d 11586 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11587#ifdef __linux__
11588 linuxOvercommitMemoryWarning();
11589#endif
9651a787 11590 start = time(NULL);
bcfc686d 11591 if (server.appendonly) {
11592 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11593 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11594 } else {
11595 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11596 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11597 }
bcfc686d 11598 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11599 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11600 aeMain(server.el);
11601 aeDeleteEventLoop(server.el);
11602 return 0;
11603}
11604
11605/* ============================= Backtrace support ========================= */
11606
11607#ifdef HAVE_BACKTRACE
11608static char *findFuncName(void *pointer, unsigned long *offset);
11609
56906eef 11610static void *getMcontextEip(ucontext_t *uc) {
11611#if defined(__FreeBSD__)
11612 return (void*) uc->uc_mcontext.mc_eip;
11613#elif defined(__dietlibc__)
11614 return (void*) uc->uc_mcontext.eip;
06db1f50 11615#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11616 #if __x86_64__
11617 return (void*) uc->uc_mcontext->__ss.__rip;
11618 #else
56906eef 11619 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11620 #endif
06db1f50 11621#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11622 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11623 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11624 #else
11625 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11626 #endif
54bac49d 11627#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11628 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11629#elif defined(__ia64__) /* Linux IA64 */
11630 return (void*) uc->uc_mcontext.sc_ip;
11631#else
11632 return NULL;
56906eef 11633#endif
11634}
11635
11636static void segvHandler(int sig, siginfo_t *info, void *secret) {
11637 void *trace[100];
11638 char **messages = NULL;
11639 int i, trace_size = 0;
11640 unsigned long offset=0;
56906eef 11641 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11642 sds infostring;
56906eef 11643 REDIS_NOTUSED(info);
11644
11645 redisLog(REDIS_WARNING,
11646 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11647 infostring = genRedisInfoString();
11648 redisLog(REDIS_WARNING, "%s",infostring);
11649 /* It's not safe to sdsfree() the returned string under memory
11650 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11651
56906eef 11652 trace_size = backtrace(trace, 100);
de96dbfe 11653 /* overwrite sigaction with caller's address */
b91cf5ef 11654 if (getMcontextEip(uc) != NULL) {
11655 trace[1] = getMcontextEip(uc);
11656 }
56906eef 11657 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11658
d76412d1 11659 for (i=1; i<trace_size; ++i) {
56906eef 11660 char *fn = findFuncName(trace[i], &offset), *p;
11661
11662 p = strchr(messages[i],'+');
11663 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11664 redisLog(REDIS_WARNING,"%s", messages[i]);
11665 } else {
11666 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11667 }
11668 }
b177fd30 11669 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11670 _exit(0);
fe3bbfbe 11671}
56906eef 11672
fab43727 11673static void sigtermHandler(int sig) {
11674 REDIS_NOTUSED(sig);
b58ba105 11675
fab43727 11676 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11677 server.shutdown_asap = 1;
b58ba105
AM
11678}
11679
56906eef 11680static void setupSigSegvAction(void) {
11681 struct sigaction act;
11682
11683 sigemptyset (&act.sa_mask);
11684 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11685 * is used. Otherwise, sa_handler is used */
11686 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11687 act.sa_sigaction = segvHandler;
11688 sigaction (SIGSEGV, &act, NULL);
11689 sigaction (SIGBUS, &act, NULL);
12fea928 11690 sigaction (SIGFPE, &act, NULL);
11691 sigaction (SIGILL, &act, NULL);
11692 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11693
11694 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11695 act.sa_handler = sigtermHandler;
b58ba105 11696 sigaction (SIGTERM, &act, NULL);
e65fdc78 11697 return;
56906eef 11698}
e65fdc78 11699
bcfc686d 11700#include "staticsymbols.h"
11701/* This function try to convert a pointer into a function name. It's used in
11702 * oreder to provide a backtrace under segmentation fault that's able to
11703 * display functions declared as static (otherwise the backtrace is useless). */
11704static char *findFuncName(void *pointer, unsigned long *offset){
11705 int i, ret = -1;
11706 unsigned long off, minoff = 0;
ed9b544e 11707
bcfc686d 11708 /* Try to match against the Symbol with the smallest offset */
11709 for (i=0; symsTable[i].pointer; i++) {
11710 unsigned long lp = (unsigned long) pointer;
0bc03378 11711
bcfc686d 11712 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11713 off=lp-symsTable[i].pointer;
11714 if (ret < 0 || off < minoff) {
11715 minoff=off;
11716 ret=i;
11717 }
11718 }
0bc03378 11719 }
bcfc686d 11720 if (ret == -1) return NULL;
11721 *offset = minoff;
11722 return symsTable[ret].name;
0bc03378 11723}
bcfc686d 11724#else /* HAVE_BACKTRACE */
/* Variant compiled when HAVE_BACKTRACE is not defined: no signal
 * handler is installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
bcfc686d 11727#endif /* HAVE_BACKTRACE */
0bc03378 11728
ed9b544e 11729
ed9b544e 11730
bcfc686d 11731/* The End */
11732
11733
ed9b544e 11734