]> git.saurik.com Git - redis.git/blame - redis.c
A first fix for SET key overwrite
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
2 * Copyright (c) 2006-2009, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
5dc70bff 30#define REDIS_VERSION "1.3.2"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
ed9b544e 41#include <signal.h>
fbf9bcdb 42
43#ifdef HAVE_BACKTRACE
c9468bcf 44#include <execinfo.h>
45#include <ucontext.h>
fbf9bcdb 46#endif /* HAVE_BACKTRACE */
47
ed9b544e 48#include <sys/wait.h>
49#include <errno.h>
50#include <assert.h>
51#include <ctype.h>
52#include <stdarg.h>
53#include <inttypes.h>
54#include <arpa/inet.h>
55#include <sys/stat.h>
56#include <fcntl.h>
57#include <sys/time.h>
58#include <sys/resource.h>
2895e862 59#include <sys/uio.h>
f78fd11b 60#include <limits.h>
a7866db6 61#include <math.h>
0bc1b2f6 62
63#if defined(__sun)
5043dff3 64#include "solarisfixes.h"
65#endif
ed9b544e 66
c9468bcf 67#include "redis.h"
ed9b544e 68#include "ae.h" /* Event driven programming library */
69#include "sds.h" /* Dynamic safe strings */
70#include "anet.h" /* Networking the easy way */
71#include "dict.h" /* Hash tables */
72#include "adlist.h" /* Linked lists */
73#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 74#include "lzf.h" /* LZF compression library */
75#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ed9b544e 76
77/* Error codes */
78#define REDIS_OK 0
79#define REDIS_ERR -1
80
81/* Static server configuration */
82#define REDIS_SERVERPORT 6379 /* TCP port */
83#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 84#define REDIS_IOBUF_LEN 1024
ed9b544e 85#define REDIS_LOADBUF_LEN 1024
93ea3759 86#define REDIS_STATIC_ARGS 4
ed9b544e 87#define REDIS_DEFAULT_DBNUM 16
88#define REDIS_CONFIGLINE_MAX 1024
89#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
90#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94754ccc 91#define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
6f376729 92#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 93#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
94
95/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
96#define REDIS_WRITEV_THRESHOLD 3
97/* Max number of iovecs used for each writev call */
98#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 99
100/* Hash table parameters */
101#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 102
103/* Command flags */
3fd78bcd 104#define REDIS_CMD_BULK 1 /* Bulk write command */
105#define REDIS_CMD_INLINE 2 /* Inline command */
106/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
107 this flag will return an error when the 'maxmemory' option is set in the
108 config file and the server is using more than maxmemory bytes of memory.
109 In short these commands are denied on low memory conditions. */
110#define REDIS_CMD_DENYOOM 4
ed9b544e 111
112/* Object types */
113#define REDIS_STRING 0
114#define REDIS_LIST 1
115#define REDIS_SET 2
1812e024 116#define REDIS_ZSET 3
117#define REDIS_HASH 4
f78fd11b 118
942a3961 119/* Objects encoding */
120#define REDIS_ENCODING_RAW 0 /* Raw representation */
121#define REDIS_ENCODING_INT 1 /* Encoded as integer */
122
f78fd11b 123/* Object types only used for dumping to disk */
bb32ede5 124#define REDIS_EXPIRETIME 253
ed9b544e 125#define REDIS_SELECTDB 254
126#define REDIS_EOF 255
127
f78fd11b 128/* Defines related to the dump file format. To store 32 bits lengths for short
129 * keys requires a lot of space, so we check the most significant 2 bits of
130 * the first byte to interpret the length:
131 *
132 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
133 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
134 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 135 * 11|000000 this means: specially encoded object will follow. The six bits
136 * number specify the kind of object that follows.
137 * See the REDIS_RDB_ENC_* defines.
f78fd11b 138 *
10c43610 139 * Lengths up to 63 are stored using a single byte, most DB keys, and many
140 * values, will fit inside. */
f78fd11b 141#define REDIS_RDB_6BITLEN 0
142#define REDIS_RDB_14BITLEN 1
143#define REDIS_RDB_32BITLEN 2
17be1a4a 144#define REDIS_RDB_ENCVAL 3
f78fd11b 145#define REDIS_RDB_LENERR UINT_MAX
146
a4d1ba9a 147/* When a length of a string object stored on disk has the first two bits
148 * set, the remaining six bits specify a special encoding for the object
149 * according to the following defines: */
150#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
151#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
152#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 153#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 154
75680a3c 155/* Virtual memory object->where field. */
156#define REDIS_VM_MEMORY 0 /* The object is on memory */
157#define REDIS_VM_SWAPPED 1 /* The object is on disk */
158#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
159#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
160
06224fec 161/* Virtual memory static configuration stuff.
162 * Check vmFindContiguousPages() to know more about these magic numbers. */
163#define REDIS_VM_MAX_NEAR_PAGES 65536
164#define REDIS_VM_MAX_RANDOM_JUMP 4096
165
ed9b544e 166/* Client flags */
167#define REDIS_CLOSE 1 /* This client connection should be closed ASAP */
168#define REDIS_SLAVE 2 /* This client is a slave server */
169#define REDIS_MASTER 4 /* This client is a master server */
87eca727 170#define REDIS_MONITOR 8 /* This client is a slave monitor, see MONITOR */
6e469882 171#define REDIS_MULTI 16 /* This client is in a MULTI context */
4409877e 172#define REDIS_BLOCKED 32 /* The client is waiting in a blocking operation */
ed9b544e 173
40d224a9 174/* Slave replication state - slave side */
ed9b544e 175#define REDIS_REPL_NONE 0 /* No active replication */
176#define REDIS_REPL_CONNECT 1 /* Must connect to master */
177#define REDIS_REPL_CONNECTED 2 /* Connected to master */
178
40d224a9 179/* Slave replication state - from the point of view of master
180 * Note that in SEND_BULK and ONLINE state the slave receives new updates
181 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
182 * to start the next background saving in order to send updates to it. */
183#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
184#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
185#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
186#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
187
ed9b544e 188/* List related stuff */
189#define REDIS_HEAD 0
190#define REDIS_TAIL 1
191
192/* Sort operations */
193#define REDIS_SORT_GET 0
443c6409 194#define REDIS_SORT_ASC 1
195#define REDIS_SORT_DESC 2
ed9b544e 196#define REDIS_SORTKEY_MAX 1024
197
198/* Log levels */
199#define REDIS_DEBUG 0
200#define REDIS_NOTICE 1
201#define REDIS_WARNING 2
202
203/* Anti-warning macro... */
204#define REDIS_NOTUSED(V) ((void) V)
205
6b47e12e 206#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
207#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 208
48f0308a 209/* Append only defines */
210#define APPENDFSYNC_NO 0
211#define APPENDFSYNC_ALWAYS 1
212#define APPENDFSYNC_EVERYSEC 2
213
dfc5e96c 214/* We can print the stacktrace, so our assert is defined this way: */
215#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e),exit(1)))
216static void _redisAssert(char *estr);
217
ed9b544e 218/*================================= Data types ============================== */
219
220/* A redis object, that is a type able to hold a string / list / set */
75680a3c 221
222/* The VM object structure: per-object virtual memory bookkeeping. It is
 * embedded as the 'vm' member of struct redisObject. */
223struct redisObjectVM {
3a66edc7 224 off_t page; /* the page at which the object is stored on disk */
225 off_t usedpages; /* number of pages used on disk */
226 time_t atime; /* Last access time */
/* NOTE(review): the 'vm' declarator on the closing line also declares a
 * file-scope variable named 'vm' of this struct type; confirm whether that
 * global is intended or a leftover. */
75680a3c 227} vm;
228
229/* The actual Redis Object */
ed9b544e 230typedef struct redisObject {
ed9b544e 231 void *ptr;
942a3961 232 unsigned char type; /* One of the "Object types" defines, e.g. REDIS_STRING */
233 unsigned char encoding; /* One of the REDIS_ENCODING_* defines */
d894161b 234 unsigned char storage; /* If this object is a key, where is the value?
235 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
236 unsigned char vtype; /* If this object is a key, and value is swapped out,
237 * this is the type of the swapped out object. */
ed9b544e 238 int refcount;
75680a3c 239 /* VM fields, these are only allocated if VM is active, otherwise the
240 * object allocation function will just allocate
241 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
242 * Redis without VM active will not have any overhead. */
243 struct redisObjectVM vm;
ed9b544e 244} robj;
245
dfc5e96c 246/* Macro used to initialize a Redis object allocated on the stack.
247 * Note that this macro is kept near the structure definition to make sure
248 * we'll update it when the structure is changed, to avoid bugs like
249 * bug #85 introduced exactly in this way.
 *
 * '_var' must be an robj object (not a pointer): its fields are set with
 * the '.' operator. '_ptr' is stored as-is in _var.ptr. The 'storage' field
 * is only written when server.vm_enabled is true; 'vtype' and the 'vm'
 * member are never written by this macro.
 *
 * NOTE(review): the expansion ends with "while(0);" (trailing semicolon), so
 * a call written as "initStaticStringObject(o,p);" produces an extra empty
 * statement and the macro cannot be used as the unbraced body of an 'if'
 * that has an 'else'. Dropping the semicolon requires checking every call
 * site first. */
250#define initStaticStringObject(_var,_ptr) do { \
251 _var.refcount = 1; \
252 _var.type = REDIS_STRING; \
253 _var.encoding = REDIS_ENCODING_RAW; \
254 _var.ptr = _ptr; \
3a66edc7 255 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
dfc5e96c 256} while(0);
257
/* One Redis database: the key space plus the per-key side tables. */
3305306f 258typedef struct redisDb {
4409877e 259 dict *dict; /* The keyspace for this DB */
260 dict *expires; /* Timeout of keys with a timeout set */
261 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
3305306f 262 int id; /* presumably the database number -- TODO confirm */
263} redisDb;
264
6e469882 265/* Client MULTI/EXEC state */
/* One command recorded inside a MULTI block: its argument vector, argument
 * count and the command table entry it refers to. */
266typedef struct multiCmd {
267 robj **argv;
268 int argc;
269 struct redisCommand *cmd;
270} multiCmd;
271
/* The list of commands recorded for a client in a MULTI context. */
272typedef struct multiState {
273 multiCmd *commands; /* Array of MULTI commands */
274 int count; /* Total number of MULTI commands */
275} multiState;
276
ed9b544e 277/* With multiplexing we need to keep per-client state.
278 * Clients are kept in a linked list. */
279typedef struct redisClient {
280 int fd;
3305306f 281 redisDb *db;
ed9b544e 282 int dictid;
283 sds querybuf;
e8a74421 284 robj **argv, **mbargv;
285 int argc, mbargc;
40d224a9 286 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 287 int multibulk; /* multi bulk command format active */
ed9b544e 288 list *reply;
289 int sentlen;
290 time_t lastinteraction; /* time of the last interaction, used for timeout */
40d224a9 291 int flags; /* REDIS_CLOSE | REDIS_SLAVE | REDIS_MONITOR */
6e469882 292 /* REDIS_MULTI (see the "Client flags" defines for the full list) */
40d224a9 293 int slaveseldb; /* slave selected db, if this client is a slave */
294 int authenticated; /* when requirepass is non-NULL */
295 int replstate; /* replication state if this is a slave */
296 int repldbfd; /* replication DB file descriptor */
6e469882 297 long repldboff; /* replication DB file offset */
40d224a9 298 off_t repldbsize; /* replication DB file size */
6e469882 299 multiState mstate; /* MULTI/EXEC state */
b177fd30 300 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 301 * operation such as BLPOP. Otherwise NULL. */
b177fd30 302 int blockingkeysnum; /* Number of blocking keys */
4409877e 303 time_t blockingto; /* Blocking operation timeout. If UNIX current time
304 * is >= blockingto then the operation timed out. */
ed9b544e 305} redisClient;
306
/* One entry of the server.saveparams array: a (seconds, changes) pair.
 * NOTE(review): presumably one automatic-save rule ("save after 'seconds'
 * if at least 'changes' modifications happened") -- confirm against the
 * code that reads saveparams. */
307struct saveparam {
308 time_t seconds;
309 int changes;
310};
311
312/* Global server state structure */
313struct redisServer {
314 int port;
315 int fd;
3305306f 316 redisDb *db;
4409877e 317 dict *sharingpool; /* Pool used for object sharing */
10c43610 318 unsigned int sharingpoolsize;
ed9b544e 319 long long dirty; /* changes to DB from the last save */
320 list *clients;
87eca727 321 list *slaves, *monitors;
ed9b544e 322 char neterr[ANET_ERR_LEN];
323 aeEventLoop *el;
324 int cronloops; /* number of times the cron function run */
325 list *objfreelist; /* A list of freed objects to avoid malloc() */
326 time_t lastsave; /* Unix time of last successful save */
5fba9f71 327 size_t usedmemory; /* Used memory in megabytes
 * NOTE(review): unit not verifiable here -- confirm */
ed9b544e 328 /* Fields used only for stats */
329 time_t stat_starttime; /* server start time */
330 long long stat_numcommands; /* number of processed commands */
331 long long stat_numconnections; /* number of connections received */
332 /* Configuration */
333 int verbosity;
334 int glueoutputbuf;
335 int maxidletime;
336 int dbnum;
337 int daemonize;
44b38ef4 338 int appendonly;
48f0308a 339 int appendfsync;
340 time_t lastfsync;
44b38ef4 341 int appendfd;
342 int appendseldb;
ed329fcf 343 char *pidfile;
9f3c422c 344 pid_t bgsavechildpid;
9d65a1bb 345 pid_t bgrewritechildpid;
346 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 347 struct saveparam *saveparams;
348 int saveparamslen;
349 char *logfile;
350 char *bindaddr;
351 char *dbfilename;
44b38ef4 352 char *appendfilename;
abcb223e 353 char *requirepass;
10c43610 354 int shareobjects;
121f70cf 355 int rdbcompression;
ed9b544e 356 /* Replication related */
357 int isslave;
d0ccebcf 358 char *masterauth;
ed9b544e 359 char *masterhost;
360 int masterport;
40d224a9 361 redisClient *master; /* client that is master for this slave */
ed9b544e 362 int replstate;
285add55 363 unsigned int maxclients;
4ef8de8a 364 unsigned long long maxmemory;
f86a74e9 365 unsigned int blockedclients;
ed9b544e 366 /* Sort parameters - qsort_r() is only available under BSD so we
367 * have to take this state global, in order to pass it to sortCompare() */
368 int sort_desc;
369 int sort_alpha;
370 int sort_bypattern;
75680a3c 371 /* Virtual memory configuration */
372 int vm_enabled;
373 off_t vm_page_size;
374 off_t vm_pages;
4ef8de8a 375 unsigned long long vm_max_memory;
75680a3c 376 /* Virtual memory state */
377 FILE *vm_fp;
378 int vm_fd;
379 off_t vm_next_page; /* Next probably empty page */
380 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 381 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 382 time_t unixtime; /* Unix time sampled every second. */
ed9b544e 383};
384
/* Signature shared by all command implementations: they receive the client
 * that issued the command and return nothing. */
385typedef void redisCommandProc(redisClient *c);
/* One entry of the command table (see cmdTable). */
386struct redisCommand {
387 char *name; /* command name as listed in cmdTable */
388 redisCommandProc *proc; /* function implementing the command */
389 int arity; /* NOTE(review): presumably the argument count, with negative
 * values meaning "at least N" -- confirm against the dispatcher */
390 int flags; /* bitwise OR of the REDIS_CMD_* "Command flags" defines */
391};
392
/* A (name, address) pair.
 * NOTE(review): presumably used to map code addresses to function names
 * when printing a stack trace -- confirm against the users of this type. */
de96dbfe 393struct redisFunctionSym {
394 char *name;
56906eef 395 unsigned long pointer;
de96dbfe 396};
397
/* One element being sorted: the object itself plus the value it is compared
 * by, either a numeric score or another object.
 * NOTE(review): which union member is active presumably depends on the
 * server.sort_alpha / sort_bypattern state -- confirm in sortCompare(). */
ed9b544e 398typedef struct _redisSortObject {
399 robj *obj;
400 union {
401 double score;
402 robj *cmpobj;
403 } u;
404} redisSortObject;
405
/* An operation attached to a sort, described by a type and a pattern.
 * NOTE(review): 'type' presumably holds one of the "Sort operations"
 * defines such as REDIS_SORT_GET -- confirm against sortCommand(). */
406typedef struct _redisSortOperation {
407 int type;
408 robj *pattern;
409} redisSortOperation;
410
6b47e12e 411/* ZSETs use a specialized version of Skiplists */
412
/* A skiplist node holding a (score, object) pair. */
413typedef struct zskiplistNode {
414 struct zskiplistNode **forward; /* presumably one forward pointer per level -- TODO confirm */
e3870fab 415 struct zskiplistNode *backward;
6b47e12e 416 double score;
417 robj *obj;
418} zskiplistNode;
419
/* The skiplist itself: end-point nodes, element count and level. */
420typedef struct zskiplist {
e3870fab 421 struct zskiplistNode *header, *tail;
d13f767c 422 unsigned long length;
6b47e12e 423 int level; /* presumably the highest level in use, bounded by
 * ZSKIPLIST_MAXLEVEL -- TODO confirm */
424} zskiplist;
425
/* A sorted set pairs a hash table with a skiplist.
 * NOTE(review): presumably the dict maps members to scores while the
 * skiplist keeps them ordered by score -- confirm against the z* commands. */
1812e024 426typedef struct zset {
427 dict *dict;
6b47e12e 428 zskiplist *zsl;
1812e024 429} zset;
430
6b47e12e 431/* Our shared "common" objects */
432
/* Declares both the struct type and its single file-scope instance 'shared'.
 * NOTE(review): the members are presumably pre-built reply objects that are
 * created once at startup -- confirm where 'shared' is initialized. */
ed9b544e 433struct sharedObjectsStruct {
c937aa89 434 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 435 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 436 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
437 *outofrangeerr, *plus,
ed9b544e 438 *select0, *select1, *select2, *select3, *select4,
439 *select5, *select6, *select7, *select8, *select9;
440} shared;
441
a7866db6 442/* Global vars that are actually used as constants. The following double
443 * values are used for double on-disk serialization, and are initialized
444 * at runtime to avoid strange compiler optimizations. */
445
446static double R_Zero, R_PosInf, R_NegInf, R_Nan;
447
ed9b544e 448/*================================ Prototypes =============================== */
449
450static void freeStringObject(robj *o);
451static void freeListObject(robj *o);
452static void freeSetObject(robj *o);
453static void decrRefCount(void *o);
454static robj *createObject(int type, void *ptr);
455static void freeClient(redisClient *c);
f78fd11b 456static int rdbLoad(char *filename);
ed9b544e 457static void addReply(redisClient *c, robj *obj);
458static void addReplySds(redisClient *c, sds s);
459static void incrRefCount(robj *o);
f78fd11b 460static int rdbSaveBackground(char *filename);
ed9b544e 461static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 462static robj *dupStringObject(robj *o);
87eca727 463static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 464static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 465static int syncWithMaster(void);
10c43610 466static robj *tryObjectSharing(robj *o);
942a3961 467static int tryObjectEncoding(robj *o);
9d65a1bb 468static robj *getDecodedObject(robj *o);
3305306f 469static int removeExpire(redisDb *db, robj *key);
470static int expireIfNeeded(redisDb *db, robj *key);
471static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 472static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 473static int deleteKey(redisDb *db, robj *key);
bb32ede5 474static time_t getExpire(redisDb *db, robj *key);
475static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 476static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 477static void freeMemoryIfNeeded(void);
de96dbfe 478static int processCommand(redisClient *c);
56906eef 479static void setupSigSegvAction(void);
a3b21203 480static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 481static void aofRemoveTempFile(pid_t childpid);
0ea663ea 482static size_t stringObjectLen(robj *o);
638e42ac 483static void processInputBuffer(redisClient *c);
6b47e12e 484static zskiplist *zslCreate(void);
fd8ccf44 485static void zslFree(zskiplist *zsl);
2b59cfdf 486static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 487static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 488static void initClientMultiState(redisClient *c);
489static void freeClientMultiState(redisClient *c);
490static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
4409877e 491static void unblockClient(redisClient *c);
492static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 493static void vmInit(void);
a35ddf12 494static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 495static robj *vmLoadObject(robj *key);
7e69548d 496static robj *vmPreviewObject(robj *key);
4ef8de8a 497static int vmSwapOneObject(void);
7e69548d 498static int vmCanSwapOut(void);
ed9b544e 499
abcb223e 500static void authCommand(redisClient *c);
ed9b544e 501static void pingCommand(redisClient *c);
502static void echoCommand(redisClient *c);
503static void setCommand(redisClient *c);
504static void setnxCommand(redisClient *c);
505static void getCommand(redisClient *c);
506static void delCommand(redisClient *c);
507static void existsCommand(redisClient *c);
508static void incrCommand(redisClient *c);
509static void decrCommand(redisClient *c);
510static void incrbyCommand(redisClient *c);
511static void decrbyCommand(redisClient *c);
512static void selectCommand(redisClient *c);
513static void randomkeyCommand(redisClient *c);
514static void keysCommand(redisClient *c);
515static void dbsizeCommand(redisClient *c);
516static void lastsaveCommand(redisClient *c);
517static void saveCommand(redisClient *c);
518static void bgsaveCommand(redisClient *c);
9d65a1bb 519static void bgrewriteaofCommand(redisClient *c);
ed9b544e 520static void shutdownCommand(redisClient *c);
521static void moveCommand(redisClient *c);
522static void renameCommand(redisClient *c);
523static void renamenxCommand(redisClient *c);
524static void lpushCommand(redisClient *c);
525static void rpushCommand(redisClient *c);
526static void lpopCommand(redisClient *c);
527static void rpopCommand(redisClient *c);
528static void llenCommand(redisClient *c);
529static void lindexCommand(redisClient *c);
530static void lrangeCommand(redisClient *c);
531static void ltrimCommand(redisClient *c);
532static void typeCommand(redisClient *c);
533static void lsetCommand(redisClient *c);
534static void saddCommand(redisClient *c);
535static void sremCommand(redisClient *c);
a4460ef4 536static void smoveCommand(redisClient *c);
ed9b544e 537static void sismemberCommand(redisClient *c);
538static void scardCommand(redisClient *c);
12fea928 539static void spopCommand(redisClient *c);
2abb95a9 540static void srandmemberCommand(redisClient *c);
ed9b544e 541static void sinterCommand(redisClient *c);
542static void sinterstoreCommand(redisClient *c);
40d224a9 543static void sunionCommand(redisClient *c);
544static void sunionstoreCommand(redisClient *c);
f4f56e1d 545static void sdiffCommand(redisClient *c);
546static void sdiffstoreCommand(redisClient *c);
ed9b544e 547static void syncCommand(redisClient *c);
548static void flushdbCommand(redisClient *c);
549static void flushallCommand(redisClient *c);
550static void sortCommand(redisClient *c);
551static void lremCommand(redisClient *c);
0f5f7e9a 552static void rpoplpushcommand(redisClient *c);
ed9b544e 553static void infoCommand(redisClient *c);
70003d28 554static void mgetCommand(redisClient *c);
87eca727 555static void monitorCommand(redisClient *c);
3305306f 556static void expireCommand(redisClient *c);
802e8373 557static void expireatCommand(redisClient *c);
f6b141c5 558static void getsetCommand(redisClient *c);
fd88489a 559static void ttlCommand(redisClient *c);
321b0e13 560static void slaveofCommand(redisClient *c);
7f957c92 561static void debugCommand(redisClient *c);
f6b141c5 562static void msetCommand(redisClient *c);
563static void msetnxCommand(redisClient *c);
fd8ccf44 564static void zaddCommand(redisClient *c);
7db723ad 565static void zincrbyCommand(redisClient *c);
cc812361 566static void zrangeCommand(redisClient *c);
50c55df5 567static void zrangebyscoreCommand(redisClient *c);
e3870fab 568static void zrevrangeCommand(redisClient *c);
3c41331e 569static void zcardCommand(redisClient *c);
1b7106e7 570static void zremCommand(redisClient *c);
6e333bbe 571static void zscoreCommand(redisClient *c);
1807985b 572static void zremrangebyscoreCommand(redisClient *c);
6e469882 573static void multiCommand(redisClient *c);
574static void execCommand(redisClient *c);
4409877e 575static void blpopCommand(redisClient *c);
576static void brpopCommand(redisClient *c);
f6b141c5 577
ed9b544e 578/*================================= Globals ================================= */
579
580/* Global vars */
581static struct redisServer server; /* server global state */
/* Command table. Each entry is {name, proc, arity, flags}, in the field
 * order of struct redisCommand; 'flags' is a bitwise OR of the REDIS_CMD_*
 * defines. The table ends with an all-NULL/zero sentinel entry.
 * NOTE(review): negative arity values presumably mean "at least N
 * arguments" -- confirm against the command dispatcher. */
582static struct redisCommand cmdTable[] = {
583 {"get",getCommand,2,REDIS_CMD_INLINE},
3fd78bcd 584 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
585 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
5109cdff 586 {"del",delCommand,-2,REDIS_CMD_INLINE},
ed9b544e 587 {"exists",existsCommand,2,REDIS_CMD_INLINE},
3fd78bcd 588 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
589 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
70003d28 590 {"mget",mgetCommand,-2,REDIS_CMD_INLINE},
3fd78bcd 591 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
592 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 593 {"rpop",rpopCommand,2,REDIS_CMD_INLINE},
594 {"lpop",lpopCommand,2,REDIS_CMD_INLINE},
b177fd30 595 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE},
596 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE},
ed9b544e 597 {"llen",llenCommand,2,REDIS_CMD_INLINE},
598 {"lindex",lindexCommand,3,REDIS_CMD_INLINE},
3fd78bcd 599 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 600 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE},
601 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE},
602 {"lrem",lremCommand,4,REDIS_CMD_BULK},
0b13687c 603 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 604 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 605 {"srem",sremCommand,3,REDIS_CMD_BULK},
a4460ef4 606 {"smove",smoveCommand,4,REDIS_CMD_BULK},
ed9b544e 607 {"sismember",sismemberCommand,3,REDIS_CMD_BULK},
608 {"scard",scardCommand,2,REDIS_CMD_INLINE},
12fea928 609 {"spop",spopCommand,2,REDIS_CMD_INLINE},
2abb95a9 610 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE},
3fd78bcd 611 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
612 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
613 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
614 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
615 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
616 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 617 {"smembers",sinterCommand,2,REDIS_CMD_INLINE}, /* reuses the "sinter" handler */
fd8ccf44 618 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
7db723ad 619 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
1b7106e7 620 {"zrem",zremCommand,3,REDIS_CMD_BULK},
1807985b 621 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE},
752da584 622 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE},
80181f78 623 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE},
752da584 624 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE},
3c41331e 625 {"zcard",zcardCommand,2,REDIS_CMD_INLINE},
6e333bbe 626 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 627 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
628 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
f6b141c5 629 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
630 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
631 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 632 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE},
633 {"select",selectCommand,2,REDIS_CMD_INLINE},
634 {"move",moveCommand,3,REDIS_CMD_INLINE},
635 {"rename",renameCommand,3,REDIS_CMD_INLINE},
636 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE},
321b0e13 637 {"expire",expireCommand,3,REDIS_CMD_INLINE},
802e8373 638 {"expireat",expireatCommand,3,REDIS_CMD_INLINE},
ed9b544e 639 {"keys",keysCommand,2,REDIS_CMD_INLINE},
640 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE},
abcb223e 641 {"auth",authCommand,2,REDIS_CMD_INLINE},
ed9b544e 642 {"ping",pingCommand,1,REDIS_CMD_INLINE},
643 {"echo",echoCommand,2,REDIS_CMD_BULK},
644 {"save",saveCommand,1,REDIS_CMD_INLINE},
645 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE},
9d65a1bb 646 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE},
ed9b544e 647 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE},
648 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE},
649 {"type",typeCommand,2,REDIS_CMD_INLINE},
6e469882 650 {"multi",multiCommand,1,REDIS_CMD_INLINE},
651 {"exec",execCommand,1,REDIS_CMD_INLINE},
ed9b544e 652 {"sync",syncCommand,1,REDIS_CMD_INLINE},
653 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE},
654 {"flushall",flushallCommand,1,REDIS_CMD_INLINE},
3fd78bcd 655 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 656 {"info",infoCommand,1,REDIS_CMD_INLINE},
87eca727 657 {"monitor",monitorCommand,1,REDIS_CMD_INLINE},
fd88489a 658 {"ttl",ttlCommand,2,REDIS_CMD_INLINE},
321b0e13 659 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE},
7f957c92 660 {"debug",debugCommand,-2,REDIS_CMD_INLINE},
ed9b544e 661 {NULL,NULL,0,0}
662};
bcfc686d 663
ed9b544e 664/*============================ Utility functions ============================ */
665
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer has to be NUL-terminated:
 * only bytes inside the given lengths are examined.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [a-z] is a range, a leading '^' negates
 *           the set, and \x inside the set stands for the byte x
 *   \x      the byte x taken literally
 *
 * An unterminated set ("[abc" with no ']') is treated as if it were closed
 * at the end of the pattern.
 *
 * If 'nocase' is non-zero, literal bytes, set members and range bounds are
 * compared after tolower(). A backslash-escaped byte inside a [...] set is
 * always compared exactly. Range bounds are compared as unsigned bytes.
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen > 0) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into one; never look past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every position. The
             * rest cannot match an empty string because it starts with
             * something other than '*'. */
            while(stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen <= 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte: nothing to match against. */
            if (stringLen <= 0)
                return 0; /* no match */

            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * pattern++ below lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped member, compared exactly. A backslash that is
                     * the last pattern byte is handled as a plain member. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    /* Use unsigned byte values: tolower() is undefined for
                     * negative arguments other than EOF. */
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Skip the backslash unless it is the last pattern byte, in
             * which case it matches itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen <= 0)
                return 0; /* a literal needs one byte of input */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Input exhausted: only trailing '*' may remain in the pattern. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
788
56906eef 789static void redisLog(int level, const char *fmt, ...) {
ed9b544e 790 va_list ap;
791 FILE *fp;
792
793 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
794 if (!fp) return;
795
796 va_start(ap, fmt);
797 if (level >= server.verbosity) {
798 char *c = ".-*";
1904ecc1 799 char buf[64];
800 time_t now;
801
802 now = time(NULL);
6c9385e0 803 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1904ecc1 804 fprintf(fp,"%s %c ",buf,c[level]);
ed9b544e 805 vfprintf(fp, fmt, ap);
806 fprintf(fp,"\n");
807 fflush(fp);
808 }
809 va_end(ap);
810
811 if (server.logfile) fclose(fp);
812}
813
814/*====================== Hash table type implementation ==================== */
815
816/* This is a hash table type that uses the SDS dynamic strings library as
817 * keys and redis objects as values (objects can hold SDS strings,
818 * lists, sets). */
819
/* Value destructor for dict entries whose value is a plain heap block:
 * the block is simply handed to zfree(). privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
825
4409877e 826static void dictListDestructor(void *privdata, void *val)
827{
828 DICT_NOTUSED(privdata);
829 listRelease((list*)val);
830}
831
ed9b544e 832static int sdsDictKeyCompare(void *privdata, const void *key1,
833 const void *key2)
834{
835 int l1,l2;
836 DICT_NOTUSED(privdata);
837
838 l1 = sdslen((sds)key1);
839 l2 = sdslen((sds)key2);
840 if (l1 != l2) return 0;
841 return memcmp(key1, key2, l1) == 0;
842}
843
844static void dictRedisObjectDestructor(void *privdata, void *val)
845{
846 DICT_NOTUSED(privdata);
847
a35ddf12 848 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
ed9b544e 849 decrRefCount(val);
850}
851
942a3961 852static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 853 const void *key2)
854{
855 const robj *o1 = key1, *o2 = key2;
856 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
857}
858
942a3961 859static unsigned int dictObjHash(const void *key) {
ed9b544e 860 const robj *o = key;
861 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
862}
863
942a3961 864static int dictEncObjKeyCompare(void *privdata, const void *key1,
865 const void *key2)
866{
9d65a1bb 867 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
868 int cmp;
942a3961 869
9d65a1bb 870 o1 = getDecodedObject(o1);
871 o2 = getDecodedObject(o2);
872 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
873 decrRefCount(o1);
874 decrRefCount(o2);
875 return cmp;
942a3961 876}
877
878static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 879 robj *o = (robj*) key;
942a3961 880
9d65a1bb 881 o = getDecodedObject(o);
882 unsigned int hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
883 decrRefCount(o);
884 return hash;
942a3961 885}
886
/* Dict type whose keys are (possibly encoded) redis objects and whose
 * values are not owned by the table: keys are hashed and compared in their
 * decoded form and released with decrRefCount(), values have no destructor.
 * In this file it backs server.sharingpool and the per-db 'expires' table. */
ed9b544e 887static dictType setDictType = {
942a3961 888 dictEncObjHash, /* hash function */
ed9b544e 889 NULL, /* key dup */
890 NULL, /* val dup */
942a3961 891 dictEncObjKeyCompare, /* key compare */
ed9b544e 892 dictRedisObjectDestructor, /* key destructor */
893 NULL /* val destructor */
894};
895
/* Dict type with the same key handling as setDictType (possibly encoded
 * redis objects, hashed and compared decoded), but whose values are heap
 * blocks released with zfree() through dictVanillaFree. */
1812e024 896static dictType zsetDictType = {
897 dictEncObjHash, /* hash function */
898 NULL, /* key dup */
899 NULL, /* val dup */
900 dictEncObjKeyCompare, /* key compare */
901 dictRedisObjectDestructor, /* key destructor */
da0a1620 902 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 903};
904
/* Dict type whose keys and values are both redis objects: keys are hashed
 * and compared directly through their ptr field (no decoding step), and
 * both keys and values are released with decrRefCount(). In this file it
 * backs the main per-db 'dict' keyspace table. */
ed9b544e 905static dictType hashDictType = {
942a3961 906 dictObjHash, /* hash function */
ed9b544e 907 NULL, /* key dup */
908 NULL, /* val dup */
942a3961 909 dictObjKeyCompare, /* key compare */
ed9b544e 910 dictRedisObjectDestructor, /* key destructor */
911 dictRedisObjectDestructor /* val destructor */
912};
913
4409877e 914/* Keylist hash table type has unencoded redis objects as keys and
915 * lists as values. It's used for blocking operations (BLPOP) */
/* Keys are released with decrRefCount(), values with listRelease().
 * In this file it backs the per-db 'blockingkeys' table. */
916static dictType keylistDictType = {
917 dictObjHash, /* hash function */
918 NULL, /* key dup */
919 NULL, /* val dup */
920 dictObjKeyCompare, /* key compare */
921 dictRedisObjectDestructor, /* key destructor */
922 dictListDestructor /* val destructor */
923};
924
ed9b544e 925/* ========================= Random utility functions ======================= */
926
927/* Redis generally does not try to recover from out of memory conditions
928 * when allocating objects or strings, it is not clear if it will be possible
929 * to report this condition to the client since the networking layer itself
930 * is based on heap allocation for send buffers, so we simply abort.
931 * At least the code will be simpler to read... */
932static void oom(const char *msg) {
71c54b21 933 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 934 sleep(1);
935 abort();
936}
937
938/* ====================== Redis server networking stuff ===================== */
56906eef 939static void closeTimedoutClients(void) {
ed9b544e 940 redisClient *c;
ed9b544e 941 listNode *ln;
942 time_t now = time(NULL);
943
6208b3a7 944 listRewind(server.clients);
945 while ((ln = listYield(server.clients)) != NULL) {
ed9b544e 946 c = listNodeValue(ln);
f86a74e9 947 if (server.maxidletime &&
948 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 949 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 950 (now - c->lastinteraction > server.maxidletime))
951 {
ed9b544e 952 redisLog(REDIS_DEBUG,"Closing idle client");
953 freeClient(c);
f86a74e9 954 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 955 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 956 addReply(c,shared.nullmultibulk);
f86a74e9 957 unblockClient(c);
958 }
ed9b544e 959 }
960 }
ed9b544e 961}
962
12fea928 963static int htNeedsResize(dict *dict) {
964 long long size, used;
965
966 size = dictSlots(dict);
967 used = dictSize(dict);
968 return (size && used && size > DICT_HT_INITIAL_SIZE &&
969 (used*100/size < REDIS_HT_MINFILL));
970}
971
0bc03378 972/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
973 * we resize the hash table to save memory */
56906eef 974static void tryResizeHashTables(void) {
0bc03378 975 int j;
976
977 for (j = 0; j < server.dbnum; j++) {
12fea928 978 if (htNeedsResize(server.db[j].dict)) {
979 redisLog(REDIS_DEBUG,"The hash table %d is too sparse, resize it...",j);
0bc03378 980 dictResize(server.db[j].dict);
12fea928 981 redisLog(REDIS_DEBUG,"Hash table %d resized.",j);
0bc03378 982 }
12fea928 983 if (htNeedsResize(server.db[j].expires))
984 dictResize(server.db[j].expires);
0bc03378 985 }
986}
987
9d65a1bb 988/* A background saving child (BGSAVE) terminated its work. Handle this. */
989void backgroundSaveDoneHandler(int statloc) {
990 int exitcode = WEXITSTATUS(statloc);
991 int bysignal = WIFSIGNALED(statloc);
992
993 if (!bysignal && exitcode == 0) {
994 redisLog(REDIS_NOTICE,
995 "Background saving terminated with success");
996 server.dirty = 0;
997 server.lastsave = time(NULL);
998 } else if (!bysignal && exitcode != 0) {
999 redisLog(REDIS_WARNING, "Background saving error");
1000 } else {
1001 redisLog(REDIS_WARNING,
1002 "Background saving terminated by signal");
1003 rdbRemoveTempFile(server.bgsavechildpid);
1004 }
1005 server.bgsavechildpid = -1;
1006 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1007 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1008 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1009}
1010
1011/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1012 * Handle this. */
1013void backgroundRewriteDoneHandler(int statloc) {
1014 int exitcode = WEXITSTATUS(statloc);
1015 int bysignal = WIFSIGNALED(statloc);
1016
1017 if (!bysignal && exitcode == 0) {
1018 int fd;
1019 char tmpfile[256];
1020
1021 redisLog(REDIS_NOTICE,
1022 "Background append only file rewriting terminated with success");
1023 /* Now it's time to flush the differences accumulated by the parent */
1024 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1025 fd = open(tmpfile,O_WRONLY|O_APPEND);
1026 if (fd == -1) {
1027 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1028 goto cleanup;
1029 }
1030 /* Flush our data... */
1031 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1032 (signed) sdslen(server.bgrewritebuf)) {
1033 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1034 close(fd);
1035 goto cleanup;
1036 }
b32627cd 1037 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1038 /* Now our work is to rename the temp file into the stable file. And
1039 * switch the file descriptor used by the server for append only. */
1040 if (rename(tmpfile,server.appendfilename) == -1) {
1041 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1042 close(fd);
1043 goto cleanup;
1044 }
1045 /* Mission completed... almost */
1046 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1047 if (server.appendfd != -1) {
1048 /* If append only is actually enabled... */
1049 close(server.appendfd);
1050 server.appendfd = fd;
1051 fsync(fd);
85a83172 1052 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1053 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1054 } else {
1055 /* If append only is disabled we just generate a dump in this
1056 * format. Why not? */
1057 close(fd);
1058 }
1059 } else if (!bysignal && exitcode != 0) {
1060 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1061 } else {
1062 redisLog(REDIS_WARNING,
1063 "Background append only file rewriting terminated by signal");
1064 }
1065cleanup:
1066 sdsfree(server.bgrewritebuf);
1067 server.bgrewritebuf = sdsempty();
1068 aofRemoveTempFile(server.bgrewritechildpid);
1069 server.bgrewritechildpid = -1;
1070}
1071
56906eef 1072static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1073 int j, loops = server.cronloops++;
ed9b544e 1074 REDIS_NOTUSED(eventLoop);
1075 REDIS_NOTUSED(id);
1076 REDIS_NOTUSED(clientData);
1077
3a66edc7 1078 /* We take a cached value of the unix time in the global state because
1079 * with virtual memory and aging there is to store the current time
1080 * in objects at every object access, and accuracy is not needed.
1081 * To access a global var is faster than calling time(NULL) */
1082 server.unixtime = time(NULL);
1083
ed9b544e 1084 /* Update the global state with the amount of used memory */
1085 server.usedmemory = zmalloc_used_memory();
1086
0bc03378 1087 /* Show some info about non-empty databases */
ed9b544e 1088 for (j = 0; j < server.dbnum; j++) {
dec423d9 1089 long long size, used, vkeys;
94754ccc 1090
3305306f 1091 size = dictSlots(server.db[j].dict);
1092 used = dictSize(server.db[j].dict);
94754ccc 1093 vkeys = dictSize(server.db[j].expires);
c3cb078d 1094 if (!(loops % 5) && (used || vkeys)) {
1095 redisLog(REDIS_DEBUG,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1096 /* dictPrintStats(server.dict); */
ed9b544e 1097 }
ed9b544e 1098 }
1099
0bc03378 1100 /* We don't want to resize the hash tables while a bacground saving
1101 * is in progress: the saving child is created using fork() that is
1102 * implemented with a copy-on-write semantic in most modern systems, so
1103 * if we resize the HT while there is the saving child at work actually
1104 * a lot of memory movements in the parent will cause a lot of pages
1105 * copied. */
9d65a1bb 1106 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1107
ed9b544e 1108 /* Show information about connected clients */
1109 if (!(loops % 5)) {
21aecf4b 1110 redisLog(REDIS_DEBUG,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1111 listLength(server.clients)-listLength(server.slaves),
1112 listLength(server.slaves),
10c43610 1113 server.usedmemory,
3305306f 1114 dictSize(server.sharingpool));
ed9b544e 1115 }
1116
1117 /* Close connections of timedout clients */
f86a74e9 1118 if ((server.maxidletime && !(loops % 10)) || server.blockedclients)
ed9b544e 1119 closeTimedoutClients();
1120
9d65a1bb 1121 /* Check if a background saving or AOF rewrite in progress terminated */
1122 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1123 int statloc;
9d65a1bb 1124 pid_t pid;
1125
1126 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1127 if (pid == server.bgsavechildpid) {
1128 backgroundSaveDoneHandler(statloc);
ed9b544e 1129 } else {
9d65a1bb 1130 backgroundRewriteDoneHandler(statloc);
ed9b544e 1131 }
ed9b544e 1132 }
1133 } else {
1134 /* If there is not a background saving in progress check if
1135 * we have to save now */
1136 time_t now = time(NULL);
1137 for (j = 0; j < server.saveparamslen; j++) {
1138 struct saveparam *sp = server.saveparams+j;
1139
1140 if (server.dirty >= sp->changes &&
1141 now-server.lastsave > sp->seconds) {
1142 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1143 sp->changes, sp->seconds);
f78fd11b 1144 rdbSaveBackground(server.dbfilename);
ed9b544e 1145 break;
1146 }
1147 }
1148 }
94754ccc 1149
f2324293 1150 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1151 * will use few CPU cycles if there are few expiring keys, otherwise
1152 * it will get more aggressive to avoid that too much memory is used by
1153 * keys that can be removed from the keyspace. */
94754ccc 1154 for (j = 0; j < server.dbnum; j++) {
f2324293 1155 int expired;
94754ccc 1156 redisDb *db = server.db+j;
94754ccc 1157
f2324293 1158 /* Continue to expire if at the end of the cycle more than 25%
1159 * of the keys were expired. */
1160 do {
4ef8de8a 1161 long num = dictSize(db->expires);
94754ccc 1162 time_t now = time(NULL);
1163
f2324293 1164 expired = 0;
94754ccc 1165 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1166 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1167 while (num--) {
1168 dictEntry *de;
1169 time_t t;
1170
1171 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1172 t = (time_t) dictGetEntryVal(de);
1173 if (now > t) {
1174 deleteKey(db,dictGetEntryKey(de));
f2324293 1175 expired++;
94754ccc 1176 }
1177 }
f2324293 1178 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1179 }
1180
4ef8de8a 1181 /* Swap a few keys on disk if we are over the memory limit and VM
1182 * is enbled. */
7e69548d 1183 if (vmCanSwapOut()) {
1184 while (server.vm_enabled && zmalloc_used_memory() >
1185 server.vm_max_memory) {
1186 if (vmSwapOneObject() == REDIS_ERR) {
1187 if (zmalloc_used_memory() >
1188 (server.vm_max_memory+server.vm_max_memory/10)) {
1189 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1190 }
1191 break;
1192 }
4ef8de8a 1193 }
1194 }
1195
ed9b544e 1196 /* Check if we should connect to a MASTER */
1197 if (server.replstate == REDIS_REPL_CONNECT) {
1198 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1199 if (syncWithMaster() == REDIS_OK) {
1200 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1201 }
1202 }
1203 return 1000;
1204}
1205
1206static void createSharedObjects(void) {
1207 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1208 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1209 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1210 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1211 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1212 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1213 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1214 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1215 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1216 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1217 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1218 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1219 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1220 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1221 "-ERR no such key\r\n"));
ed9b544e 1222 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1223 "-ERR syntax error\r\n"));
c937aa89 1224 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1225 "-ERR source and destination objects are the same\r\n"));
1226 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1227 "-ERR index out of range\r\n"));
ed9b544e 1228 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1229 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1230 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1231 shared.select0 = createStringObject("select 0\r\n",10);
1232 shared.select1 = createStringObject("select 1\r\n",10);
1233 shared.select2 = createStringObject("select 2\r\n",10);
1234 shared.select3 = createStringObject("select 3\r\n",10);
1235 shared.select4 = createStringObject("select 4\r\n",10);
1236 shared.select5 = createStringObject("select 5\r\n",10);
1237 shared.select6 = createStringObject("select 6\r\n",10);
1238 shared.select7 = createStringObject("select 7\r\n",10);
1239 shared.select8 = createStringObject("select 8\r\n",10);
1240 shared.select9 = createStringObject("select 9\r\n",10);
1241}
1242
1243static void appendServerSaveParams(time_t seconds, int changes) {
1244 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1245 server.saveparams[server.saveparamslen].seconds = seconds;
1246 server.saveparams[server.saveparamslen].changes = changes;
1247 server.saveparamslen++;
1248}
1249
bcfc686d 1250static void resetServerSaveParams() {
ed9b544e 1251 zfree(server.saveparams);
1252 server.saveparams = NULL;
1253 server.saveparamslen = 0;
1254}
1255
1256static void initServerConfig() {
1257 server.dbnum = REDIS_DEFAULT_DBNUM;
1258 server.port = REDIS_SERVERPORT;
1259 server.verbosity = REDIS_DEBUG;
1260 server.maxidletime = REDIS_MAXIDLETIME;
1261 server.saveparams = NULL;
1262 server.logfile = NULL; /* NULL = log on standard output */
1263 server.bindaddr = NULL;
1264 server.glueoutputbuf = 1;
1265 server.daemonize = 0;
44b38ef4 1266 server.appendonly = 0;
4e141d5a 1267 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1268 server.lastfsync = time(NULL);
44b38ef4 1269 server.appendfd = -1;
1270 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1271 server.pidfile = "/var/run/redis.pid";
ed9b544e 1272 server.dbfilename = "dump.rdb";
9d65a1bb 1273 server.appendfilename = "appendonly.aof";
abcb223e 1274 server.requirepass = NULL;
10c43610 1275 server.shareobjects = 0;
b0553789 1276 server.rdbcompression = 1;
21aecf4b 1277 server.sharingpoolsize = 1024;
285add55 1278 server.maxclients = 0;
f86a74e9 1279 server.blockedclients = 0;
3fd78bcd 1280 server.maxmemory = 0;
75680a3c 1281 server.vm_enabled = 0;
1282 server.vm_page_size = 256; /* 256 bytes per page */
1283 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1284 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1285
bcfc686d 1286 resetServerSaveParams();
ed9b544e 1287
1288 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1289 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1290 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1291 /* Replication related */
1292 server.isslave = 0;
d0ccebcf 1293 server.masterauth = NULL;
ed9b544e 1294 server.masterhost = NULL;
1295 server.masterport = 6379;
1296 server.master = NULL;
1297 server.replstate = REDIS_REPL_NONE;
a7866db6 1298
1299 /* Double constants initialization */
1300 R_Zero = 0.0;
1301 R_PosInf = 1.0/R_Zero;
1302 R_NegInf = -1.0/R_Zero;
1303 R_Nan = R_Zero/R_Zero;
ed9b544e 1304}
1305
1306static void initServer() {
1307 int j;
1308
1309 signal(SIGHUP, SIG_IGN);
1310 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1311 setupSigSegvAction();
ed9b544e 1312
1313 server.clients = listCreate();
1314 server.slaves = listCreate();
87eca727 1315 server.monitors = listCreate();
ed9b544e 1316 server.objfreelist = listCreate();
1317 createSharedObjects();
1318 server.el = aeCreateEventLoop();
3305306f 1319 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1320 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1321 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1322 if (server.fd == -1) {
1323 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1324 exit(1);
1325 }
3305306f 1326 for (j = 0; j < server.dbnum; j++) {
1327 server.db[j].dict = dictCreate(&hashDictType,NULL);
1328 server.db[j].expires = dictCreate(&setDictType,NULL);
4409877e 1329 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
3305306f 1330 server.db[j].id = j;
1331 }
ed9b544e 1332 server.cronloops = 0;
9f3c422c 1333 server.bgsavechildpid = -1;
9d65a1bb 1334 server.bgrewritechildpid = -1;
1335 server.bgrewritebuf = sdsempty();
ed9b544e 1336 server.lastsave = time(NULL);
1337 server.dirty = 0;
1338 server.usedmemory = 0;
1339 server.stat_numcommands = 0;
1340 server.stat_numconnections = 0;
1341 server.stat_starttime = time(NULL);
3a66edc7 1342 server.unixtime = time(NULL);
d8f8b666 1343 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
44b38ef4 1344
1345 if (server.appendonly) {
71eba477 1346 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1347 if (server.appendfd == -1) {
1348 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1349 strerror(errno));
1350 exit(1);
1351 }
1352 }
75680a3c 1353
1354 if (server.vm_enabled) vmInit();
ed9b544e 1355}
1356
1357/* Empty the whole database */
ca37e9cd 1358static long long emptyDb() {
ed9b544e 1359 int j;
ca37e9cd 1360 long long removed = 0;
ed9b544e 1361
3305306f 1362 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1363 removed += dictSize(server.db[j].dict);
3305306f 1364 dictEmpty(server.db[j].dict);
1365 dictEmpty(server.db[j].expires);
1366 }
ca37e9cd 1367 return removed;
ed9b544e 1368}
1369
/* Convert a "yes" / "no" string (compared case insensitively) into an
 * integer: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1375
ed9b544e 1376/* I agree, this is a very rudimental way to load a configuration...
1377 will improve later if the config gets more complex */
/* Load the server configuration, one directive per line, from 'filename'
 * (or from standard input when 'filename' is exactly "-").
 *
 * Every line is trimmed of blanks; empty lines and lines starting with '#'
 * are skipped. The remaining lines are split on single spaces, the first
 * token is lowercased and matched against the known directives, and the
 * matching field of the global server structure is updated.
 *
 * Nothing is returned: if the file can't be opened, a directive is unknown,
 * has the wrong number of arguments or an invalid value, an error is
 * reported and the process exits with code 1. */
1378static void loadServerConfig(char *filename) {
c9a111ac 1379 FILE *fp;
ed9b544e 1380 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1381 int linenum = 0;
1382 sds line = NULL;
c9a111ac 1383
1384 if (filename[0] == '-' && filename[1] == '\0')
1385 fp = stdin;
1386 else {
1387 if ((fp = fopen(filename,"r")) == NULL) {
1388 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1389 exit(1);
1390 }
ed9b544e 1391 }
c9a111ac 1392
ed9b544e 1393 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1394 sds *argv;
1395 int argc, j;
1396
/* linenum is only used to report the offending line in loaderr */
1397 linenum++;
1398 line = sdsnew(buf);
1399 line = sdstrim(line," \t\r\n");
1400
1401 /* Skip comments and blank lines*/
1402 if (line[0] == '#' || line[0] == '\0') {
1403 sdsfree(line);
1404 continue;
1405 }
1406
1407 /* Split into arguments */
1408 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1409 sdstolower(argv[0]);
1410
1411 /* Execute config directives */
bb0b03a3 1412 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1413 server.maxidletime = atoi(argv[1]);
0150db36 1414 if (server.maxidletime < 0) {
ed9b544e 1415 err = "Invalid timeout value"; goto loaderr;
1416 }
bb0b03a3 1417 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1418 server.port = atoi(argv[1]);
1419 if (server.port < 1 || server.port > 65535) {
1420 err = "Invalid port"; goto loaderr;
1421 }
bb0b03a3 1422 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1423 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1424 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
/* "save <seconds> <changes>": every occurrence adds one save point */
ed9b544e 1425 int seconds = atoi(argv[1]);
1426 int changes = atoi(argv[2]);
1427 if (seconds < 1 || changes < 0) {
1428 err = "Invalid save parameters"; goto loaderr;
1429 }
1430 appendServerSaveParams(seconds,changes);
bb0b03a3 1431 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
/* The working directory is changed immediately, while parsing */
ed9b544e 1432 if (chdir(argv[1]) == -1) {
1433 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1434 argv[1], strerror(errno));
1435 exit(1);
1436 }
bb0b03a3 1437 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1438 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1439 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1440 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1441 else {
1442 err = "Invalid log level. Must be one of debug, notice, warning";
1443 goto loaderr;
1444 }
bb0b03a3 1445 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1446 FILE *logfp;
ed9b544e 1447
/* The special name "stdout" maps to a NULL logfile */
1448 server.logfile = zstrdup(argv[1]);
bb0b03a3 1449 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1450 zfree(server.logfile);
1451 server.logfile = NULL;
1452 }
1453 if (server.logfile) {
1454 /* Test if we are able to open the file. The server will not
1455 * be able to abort just for this problem later... */
c9a111ac 1456 logfp = fopen(server.logfile,"a");
1457 if (logfp == NULL) {
ed9b544e 1458 err = sdscatprintf(sdsempty(),
1459 "Can't open the log file: %s", strerror(errno));
1460 goto loaderr;
1461 }
c9a111ac 1462 fclose(logfp);
ed9b544e 1463 }
bb0b03a3 1464 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1465 server.dbnum = atoi(argv[1]);
1466 if (server.dbnum < 1) {
1467 err = "Invalid number of databases"; goto loaderr;
1468 }
285add55 1469 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1470 server.maxclients = atoi(argv[1]);
3fd78bcd 1471 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1472 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1473 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
/* "slaveof <host> <port>": also arms the connection to the master */
ed9b544e 1474 server.masterhost = sdsnew(argv[1]);
1475 server.masterport = atoi(argv[2]);
1476 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1477 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1478 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1479 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1480 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1481 err = "argument must be 'yes' or 'no'"; goto loaderr;
1482 }
bb0b03a3 1483 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1484 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1485 err = "argument must be 'yes' or 'no'"; goto loaderr;
1486 }
121f70cf 1487 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1488 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1489 err = "argument must be 'yes' or 'no'"; goto loaderr;
1490 }
e52c65b9 1491 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1492 server.sharingpoolsize = atoi(argv[1]);
1493 if (server.sharingpoolsize < 1) {
1494 err = "invalid object sharing pool size"; goto loaderr;
1495 }
bb0b03a3 1496 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1497 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1498 err = "argument must be 'yes' or 'no'"; goto loaderr;
1499 }
44b38ef4 1500 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1501 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1502 err = "argument must be 'yes' or 'no'"; goto loaderr;
1503 }
48f0308a 1504 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1505 if (!strcasecmp(argv[1],"no")) {
48f0308a 1506 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1507 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1508 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1509 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1510 server.appendfsync = APPENDFSYNC_EVERYSEC;
1511 } else {
1512 err = "argument must be 'no', 'always' or 'everysec'";
1513 goto loaderr;
1514 }
bb0b03a3 1515 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
abcb223e 1516 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1517 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
ed329fcf 1518 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1519 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
b8b553c8 1520 server.dbfilename = zstrdup(argv[1]);
75680a3c 1521 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1522 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1523 err = "argument must be 'yes' or 'no'"; goto loaderr;
1524 }
4ef8de8a 1525 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1526 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1527 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1528 server.vm_page_size = strtoll(argv[1], NULL, 10);
1529 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1530 server.vm_pages = strtoll(argv[1], NULL, 10);
ed9b544e 1531 } else {
/* Unknown directive, or known directive with the wrong argument count */
1532 err = "Bad directive or wrong number of arguments"; goto loaderr;
1533 }
/* Release the tokens and the line of the directive just processed */
1534 for (j = 0; j < argc; j++)
1535 sdsfree(argv[j]);
1536 zfree(argv);
1537 sdsfree(line);
1538 }
c9a111ac 1539 if (fp != stdin) fclose(fp);
ed9b544e 1540 return;
1541
/* Fatal error path: print the line number, the offending line and the
 * error message on standard error, then exit. */
1542loaderr:
1543 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1544 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1545 fprintf(stderr, ">>> '%s'\n", line);
1546 fprintf(stderr, "%s\n", err);
1547 exit(1);
1548}
1549
1550static void freeClientArgv(redisClient *c) {
1551 int j;
1552
1553 for (j = 0; j < c->argc; j++)
1554 decrRefCount(c->argv[j]);
e8a74421 1555 for (j = 0; j < c->mbargc; j++)
1556 decrRefCount(c->mbargv[j]);
ed9b544e 1557 c->argc = 0;
e8a74421 1558 c->mbargc = 0;
ed9b544e 1559}
1560
1561static void freeClient(redisClient *c) {
1562 listNode *ln;
1563
4409877e 1564 /* Note that if the client we are freeing is blocked into a blocking
1565 * call, we have to set querybuf to NULL *before* to call unblockClient()
1566 * to avoid processInputBuffer() will get called. Also it is important
1567 * to remove the file events after this, because this call adds
1568 * the READABLE event. */
1569 sdsfree(c->querybuf);
1570 c->querybuf = NULL;
1571 if (c->flags & REDIS_BLOCKED)
1572 unblockClient(c);
1573
ed9b544e 1574 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1575 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1576 listRelease(c->reply);
1577 freeClientArgv(c);
1578 close(c->fd);
1579 ln = listSearchKey(server.clients,c);
dfc5e96c 1580 redisAssert(ln != NULL);
ed9b544e 1581 listDelNode(server.clients,ln);
1582 if (c->flags & REDIS_SLAVE) {
6208b3a7 1583 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1584 close(c->repldbfd);
87eca727 1585 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1586 ln = listSearchKey(l,c);
dfc5e96c 1587 redisAssert(ln != NULL);
87eca727 1588 listDelNode(l,ln);
ed9b544e 1589 }
1590 if (c->flags & REDIS_MASTER) {
1591 server.master = NULL;
1592 server.replstate = REDIS_REPL_CONNECT;
1593 }
93ea3759 1594 zfree(c->argv);
e8a74421 1595 zfree(c->mbargv);
6e469882 1596 freeClientMultiState(c);
ed9b544e 1597 zfree(c);
1598}
1599
cc30e368 1600#define GLUEREPLY_UP_TO (1024)
ed9b544e 1601static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1602 int copylen = 0;
1603 char buf[GLUEREPLY_UP_TO];
6208b3a7 1604 listNode *ln;
ed9b544e 1605 robj *o;
1606
6208b3a7 1607 listRewind(c->reply);
1608 while((ln = listYield(c->reply))) {
c28b42ac 1609 int objlen;
1610
ed9b544e 1611 o = ln->value;
c28b42ac 1612 objlen = sdslen(o->ptr);
1613 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1614 memcpy(buf+copylen,o->ptr,objlen);
1615 copylen += objlen;
ed9b544e 1616 listDelNode(c->reply,ln);
c28b42ac 1617 } else {
1618 if (copylen == 0) return;
1619 break;
ed9b544e 1620 }
ed9b544e 1621 }
c28b42ac 1622 /* Now the output buffer is empty, add the new single element */
1623 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1624 listAddNodeHead(c->reply,o);
ed9b544e 1625}
1626
1627static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1628 redisClient *c = privdata;
1629 int nwritten = 0, totwritten = 0, objlen;
1630 robj *o;
1631 REDIS_NOTUSED(el);
1632 REDIS_NOTUSED(mask);
1633
2895e862 1634 /* Use writev() if we have enough buffers to send */
7ea870c0 1635 if (!server.glueoutputbuf &&
1636 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1637 !(c->flags & REDIS_MASTER))
2895e862 1638 {
1639 sendReplyToClientWritev(el, fd, privdata, mask);
1640 return;
1641 }
2895e862 1642
ed9b544e 1643 while(listLength(c->reply)) {
c28b42ac 1644 if (server.glueoutputbuf && listLength(c->reply) > 1)
1645 glueReplyBuffersIfNeeded(c);
1646
ed9b544e 1647 o = listNodeValue(listFirst(c->reply));
1648 objlen = sdslen(o->ptr);
1649
1650 if (objlen == 0) {
1651 listDelNode(c->reply,listFirst(c->reply));
1652 continue;
1653 }
1654
1655 if (c->flags & REDIS_MASTER) {
6f376729 1656 /* Don't reply to a master */
ed9b544e 1657 nwritten = objlen - c->sentlen;
1658 } else {
a4d1ba9a 1659 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1660 if (nwritten <= 0) break;
1661 }
1662 c->sentlen += nwritten;
1663 totwritten += nwritten;
1664 /* If we fully sent the object on head go to the next one */
1665 if (c->sentlen == objlen) {
1666 listDelNode(c->reply,listFirst(c->reply));
1667 c->sentlen = 0;
1668 }
6f376729 1669 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1670 * bytes, in a single threaded server it's a good idea to serve
6f376729 1671 * other clients as well, even if a very large request comes from
1672 * super fast link that is always able to accept data (in real world
12f9d551 1673 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1674 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1675 }
1676 if (nwritten == -1) {
1677 if (errno == EAGAIN) {
1678 nwritten = 0;
1679 } else {
1680 redisLog(REDIS_DEBUG,
1681 "Error writing to client: %s", strerror(errno));
1682 freeClient(c);
1683 return;
1684 }
1685 }
1686 if (totwritten > 0) c->lastinteraction = time(NULL);
1687 if (listLength(c->reply) == 0) {
1688 c->sentlen = 0;
1689 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1690 }
1691}
1692
2895e862 1693static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1694{
1695 redisClient *c = privdata;
1696 int nwritten = 0, totwritten = 0, objlen, willwrite;
1697 robj *o;
1698 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1699 int offset, ion = 0;
1700 REDIS_NOTUSED(el);
1701 REDIS_NOTUSED(mask);
1702
1703 listNode *node;
1704 while (listLength(c->reply)) {
1705 offset = c->sentlen;
1706 ion = 0;
1707 willwrite = 0;
1708
1709 /* fill-in the iov[] array */
1710 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1711 o = listNodeValue(node);
1712 objlen = sdslen(o->ptr);
1713
1714 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1715 break;
1716
1717 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1718 break; /* no more iovecs */
1719
1720 iov[ion].iov_base = ((char*)o->ptr) + offset;
1721 iov[ion].iov_len = objlen - offset;
1722 willwrite += objlen - offset;
1723 offset = 0; /* just for the first item */
1724 ion++;
1725 }
1726
1727 if(willwrite == 0)
1728 break;
1729
1730 /* write all collected blocks at once */
1731 if((nwritten = writev(fd, iov, ion)) < 0) {
1732 if (errno != EAGAIN) {
1733 redisLog(REDIS_DEBUG,
1734 "Error writing to client: %s", strerror(errno));
1735 freeClient(c);
1736 return;
1737 }
1738 break;
1739 }
1740
1741 totwritten += nwritten;
1742 offset = c->sentlen;
1743
1744 /* remove written robjs from c->reply */
1745 while (nwritten && listLength(c->reply)) {
1746 o = listNodeValue(listFirst(c->reply));
1747 objlen = sdslen(o->ptr);
1748
1749 if(nwritten >= objlen - offset) {
1750 listDelNode(c->reply, listFirst(c->reply));
1751 nwritten -= objlen - offset;
1752 c->sentlen = 0;
1753 } else {
1754 /* partial write */
1755 c->sentlen += nwritten;
1756 break;
1757 }
1758 offset = 0;
1759 }
1760 }
1761
1762 if (totwritten > 0)
1763 c->lastinteraction = time(NULL);
1764
1765 if (listLength(c->reply) == 0) {
1766 c->sentlen = 0;
1767 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1768 }
1769}
1770
ed9b544e 1771static struct redisCommand *lookupCommand(char *name) {
1772 int j = 0;
1773 while(cmdTable[j].name != NULL) {
bb0b03a3 1774 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 1775 j++;
1776 }
1777 return NULL;
1778}
1779
1780/* resetClient prepare the client to process the next command */
1781static void resetClient(redisClient *c) {
1782 freeClientArgv(c);
1783 c->bulklen = -1;
e8a74421 1784 c->multibulk = 0;
ed9b544e 1785}
1786
6e469882 1787/* Call() is the core of Redis execution of a command */
1788static void call(redisClient *c, struct redisCommand *cmd) {
1789 long long dirty;
1790
1791 dirty = server.dirty;
1792 cmd->proc(c);
1793 if (server.appendonly && server.dirty-dirty)
1794 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1795 if (server.dirty-dirty && listLength(server.slaves))
1796 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1797 if (listLength(server.monitors))
1798 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1799 server.stat_numcommands++;
1800}
1801
ed9b544e 1802/* If this function gets called we already read a whole
1803 * command, argments are in the client argv/argc fields.
1804 * processCommand() execute the command or prepare the
1805 * server for a bulk read from the client.
1806 *
1807 * If 1 is returned the client is still alive and valid and
1808 * and other operations can be performed by the caller. Otherwise
1809 * if 0 is returned the client was destroied (i.e. after QUIT). */
1810static int processCommand(redisClient *c) {
1811 struct redisCommand *cmd;
ed9b544e 1812
3fd78bcd 1813 /* Free some memory if needed (maxmemory setting) */
1814 if (server.maxmemory) freeMemoryIfNeeded();
1815
e8a74421 1816 /* Handle the multi bulk command type. This is an alternative protocol
1817 * supported by Redis in order to receive commands that are composed of
1818 * multiple binary-safe "bulk" arguments. The latency of processing is
1819 * a bit higher but this allows things like multi-sets, so if this
1820 * protocol is used only for MSET and similar commands this is a big win. */
1821 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
1822 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
1823 if (c->multibulk <= 0) {
1824 resetClient(c);
1825 return 1;
1826 } else {
1827 decrRefCount(c->argv[c->argc-1]);
1828 c->argc--;
1829 return 1;
1830 }
1831 } else if (c->multibulk) {
1832 if (c->bulklen == -1) {
1833 if (((char*)c->argv[0]->ptr)[0] != '$') {
1834 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
1835 resetClient(c);
1836 return 1;
1837 } else {
1838 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
1839 decrRefCount(c->argv[0]);
1840 if (bulklen < 0 || bulklen > 1024*1024*1024) {
1841 c->argc--;
1842 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
1843 resetClient(c);
1844 return 1;
1845 }
1846 c->argc--;
1847 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
1848 return 1;
1849 }
1850 } else {
1851 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
1852 c->mbargv[c->mbargc] = c->argv[0];
1853 c->mbargc++;
1854 c->argc--;
1855 c->multibulk--;
1856 if (c->multibulk == 0) {
1857 robj **auxargv;
1858 int auxargc;
1859
1860 /* Here we need to swap the multi-bulk argc/argv with the
1861 * normal argc/argv of the client structure. */
1862 auxargv = c->argv;
1863 c->argv = c->mbargv;
1864 c->mbargv = auxargv;
1865
1866 auxargc = c->argc;
1867 c->argc = c->mbargc;
1868 c->mbargc = auxargc;
1869
1870 /* We need to set bulklen to something different than -1
1871 * in order for the code below to process the command without
1872 * to try to read the last argument of a bulk command as
1873 * a special argument. */
1874 c->bulklen = 0;
1875 /* continue below and process the command */
1876 } else {
1877 c->bulklen = -1;
1878 return 1;
1879 }
1880 }
1881 }
1882 /* -- end of multi bulk commands processing -- */
1883
ed9b544e 1884 /* The QUIT command is handled as a special case. Normal command
1885 * procs are unable to close the client connection safely */
bb0b03a3 1886 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 1887 freeClient(c);
1888 return 0;
1889 }
1890 cmd = lookupCommand(c->argv[0]->ptr);
1891 if (!cmd) {
2c14807b 1892 addReplySds(c,
1893 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
1894 (char*)c->argv[0]->ptr));
ed9b544e 1895 resetClient(c);
1896 return 1;
1897 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
1898 (c->argc < -cmd->arity)) {
454d4e43 1899 addReplySds(c,
1900 sdscatprintf(sdsempty(),
1901 "-ERR wrong number of arguments for '%s' command\r\n",
1902 cmd->name));
ed9b544e 1903 resetClient(c);
1904 return 1;
3fd78bcd 1905 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
1906 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
1907 resetClient(c);
1908 return 1;
ed9b544e 1909 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
1910 int bulklen = atoi(c->argv[c->argc-1]->ptr);
1911
1912 decrRefCount(c->argv[c->argc-1]);
1913 if (bulklen < 0 || bulklen > 1024*1024*1024) {
1914 c->argc--;
1915 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
1916 resetClient(c);
1917 return 1;
1918 }
1919 c->argc--;
1920 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
1921 /* It is possible that the bulk read is already in the
8d0490e7 1922 * buffer. Check this condition and handle it accordingly.
1923 * This is just a fast path, alternative to call processInputBuffer().
1924 * It's a good idea since the code is small and this condition
1925 * happens most of the times. */
ed9b544e 1926 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
1927 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
1928 c->argc++;
1929 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
1930 } else {
1931 return 1;
1932 }
1933 }
10c43610 1934 /* Let's try to share objects on the command arguments vector */
1935 if (server.shareobjects) {
1936 int j;
1937 for(j = 1; j < c->argc; j++)
1938 c->argv[j] = tryObjectSharing(c->argv[j]);
1939 }
942a3961 1940 /* Let's try to encode the bulk object to save space. */
1941 if (cmd->flags & REDIS_CMD_BULK)
1942 tryObjectEncoding(c->argv[c->argc-1]);
1943
e63943a4 1944 /* Check if the user is authenticated */
1945 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
1946 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
1947 resetClient(c);
1948 return 1;
1949 }
1950
ed9b544e 1951 /* Exec the command */
6e469882 1952 if (c->flags & REDIS_MULTI && cmd->proc != execCommand) {
1953 queueMultiCommand(c,cmd);
1954 addReply(c,shared.queued);
1955 } else {
1956 call(c,cmd);
1957 }
ed9b544e 1958
1959 /* Prepare the client for the next command */
1960 if (c->flags & REDIS_CLOSE) {
1961 freeClient(c);
1962 return 0;
1963 }
1964 resetClient(c);
1965 return 1;
1966}
1967
87eca727 1968static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 1969 listNode *ln;
ed9b544e 1970 int outc = 0, j;
93ea3759 1971 robj **outv;
1972 /* (args*2)+1 is enough room for args, spaces, newlines */
1973 robj *static_outv[REDIS_STATIC_ARGS*2+1];
1974
1975 if (argc <= REDIS_STATIC_ARGS) {
1976 outv = static_outv;
1977 } else {
1978 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 1979 }
ed9b544e 1980
1981 for (j = 0; j < argc; j++) {
1982 if (j != 0) outv[outc++] = shared.space;
1983 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
1984 robj *lenobj;
1985
1986 lenobj = createObject(REDIS_STRING,
682ac724 1987 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 1988 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 1989 lenobj->refcount = 0;
1990 outv[outc++] = lenobj;
1991 }
1992 outv[outc++] = argv[j];
1993 }
1994 outv[outc++] = shared.crlf;
1995
40d224a9 1996 /* Increment all the refcounts at start and decrement at end in order to
1997 * be sure to free objects if there is no slave in a replication state
1998 * able to be feed with commands */
1999 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
6208b3a7 2000 listRewind(slaves);
2001 while((ln = listYield(slaves))) {
ed9b544e 2002 redisClient *slave = ln->value;
40d224a9 2003
2004 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2005 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2006
2007 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2008 if (slave->slaveseldb != dictid) {
2009 robj *selectcmd;
2010
2011 switch(dictid) {
2012 case 0: selectcmd = shared.select0; break;
2013 case 1: selectcmd = shared.select1; break;
2014 case 2: selectcmd = shared.select2; break;
2015 case 3: selectcmd = shared.select3; break;
2016 case 4: selectcmd = shared.select4; break;
2017 case 5: selectcmd = shared.select5; break;
2018 case 6: selectcmd = shared.select6; break;
2019 case 7: selectcmd = shared.select7; break;
2020 case 8: selectcmd = shared.select8; break;
2021 case 9: selectcmd = shared.select9; break;
2022 default:
2023 selectcmd = createObject(REDIS_STRING,
2024 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2025 selectcmd->refcount = 0;
2026 break;
2027 }
2028 addReply(slave,selectcmd);
2029 slave->slaveseldb = dictid;
2030 }
2031 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2032 }
40d224a9 2033 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2034 if (outv != static_outv) zfree(outv);
ed9b544e 2035}
2036
638e42ac 2037static void processInputBuffer(redisClient *c) {
ed9b544e 2038again:
4409877e 2039 /* Before to process the input buffer, make sure the client is not
2040 * waitig for a blocking operation such as BLPOP. Note that the first
2041 * iteration the client is never blocked, otherwise the processInputBuffer
2042 * would not be called at all, but after the execution of the first commands
2043 * in the input buffer the client may be blocked, and the "goto again"
2044 * will try to reiterate. The following line will make it return asap. */
2045 if (c->flags & REDIS_BLOCKED) return;
ed9b544e 2046 if (c->bulklen == -1) {
2047 /* Read the first line of the query */
2048 char *p = strchr(c->querybuf,'\n');
2049 size_t querylen;
644fafa3 2050
ed9b544e 2051 if (p) {
2052 sds query, *argv;
2053 int argc, j;
2054
2055 query = c->querybuf;
2056 c->querybuf = sdsempty();
2057 querylen = 1+(p-(query));
2058 if (sdslen(query) > querylen) {
2059 /* leave data after the first line of the query in the buffer */
2060 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2061 }
2062 *p = '\0'; /* remove "\n" */
2063 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2064 sdsupdatelen(query);
2065
2066 /* Now we can split the query in arguments */
ed9b544e 2067 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2068 sdsfree(query);
2069
2070 if (c->argv) zfree(c->argv);
2071 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2072
2073 for (j = 0; j < argc; j++) {
ed9b544e 2074 if (sdslen(argv[j])) {
2075 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2076 c->argc++;
2077 } else {
2078 sdsfree(argv[j]);
2079 }
2080 }
2081 zfree(argv);
7c49733c 2082 if (c->argc) {
2083 /* Execute the command. If the client is still valid
2084 * after processCommand() return and there is something
2085 * on the query buffer try to process the next command. */
2086 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2087 } else {
2088 /* Nothing to process, argc == 0. Just process the query
2089 * buffer if it's not empty or return to the caller */
2090 if (sdslen(c->querybuf)) goto again;
2091 }
ed9b544e 2092 return;
644fafa3 2093 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
ed9b544e 2094 redisLog(REDIS_DEBUG, "Client protocol error");
2095 freeClient(c);
2096 return;
2097 }
2098 } else {
2099 /* Bulk read handling. Note that if we are at this point
2100 the client already sent a command terminated with a newline,
2101 we are reading the bulk data that is actually the last
2102 argument of the command. */
2103 int qbl = sdslen(c->querybuf);
2104
2105 if (c->bulklen <= qbl) {
2106 /* Copy everything but the final CRLF as final argument */
2107 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2108 c->argc++;
2109 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2110 /* Process the command. If the client is still valid after
2111 * the processing and there is more data in the buffer
2112 * try to parse it. */
2113 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2114 return;
2115 }
2116 }
2117}
2118
638e42ac 2119static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2120 redisClient *c = (redisClient*) privdata;
2121 char buf[REDIS_IOBUF_LEN];
2122 int nread;
2123 REDIS_NOTUSED(el);
2124 REDIS_NOTUSED(mask);
2125
2126 nread = read(fd, buf, REDIS_IOBUF_LEN);
2127 if (nread == -1) {
2128 if (errno == EAGAIN) {
2129 nread = 0;
2130 } else {
2131 redisLog(REDIS_DEBUG, "Reading from client: %s",strerror(errno));
2132 freeClient(c);
2133 return;
2134 }
2135 } else if (nread == 0) {
2136 redisLog(REDIS_DEBUG, "Client closed connection");
2137 freeClient(c);
2138 return;
2139 }
2140 if (nread) {
2141 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2142 c->lastinteraction = time(NULL);
2143 } else {
2144 return;
2145 }
2146 processInputBuffer(c);
2147}
2148
ed9b544e 2149static int selectDb(redisClient *c, int id) {
2150 if (id < 0 || id >= server.dbnum)
2151 return REDIS_ERR;
3305306f 2152 c->db = &server.db[id];
ed9b544e 2153 return REDIS_OK;
2154}
2155
40d224a9 2156static void *dupClientReplyValue(void *o) {
2157 incrRefCount((robj*)o);
2158 return 0;
2159}
2160
ed9b544e 2161static redisClient *createClient(int fd) {
2162 redisClient *c = zmalloc(sizeof(*c));
2163
2164 anetNonBlock(NULL,fd);
2165 anetTcpNoDelay(NULL,fd);
2166 if (!c) return NULL;
2167 selectDb(c,0);
2168 c->fd = fd;
2169 c->querybuf = sdsempty();
2170 c->argc = 0;
93ea3759 2171 c->argv = NULL;
ed9b544e 2172 c->bulklen = -1;
e8a74421 2173 c->multibulk = 0;
2174 c->mbargc = 0;
2175 c->mbargv = NULL;
ed9b544e 2176 c->sentlen = 0;
2177 c->flags = 0;
2178 c->lastinteraction = time(NULL);
abcb223e 2179 c->authenticated = 0;
40d224a9 2180 c->replstate = REDIS_REPL_NONE;
6b47e12e 2181 c->reply = listCreate();
b177fd30 2182 c->blockingkeys = NULL;
2183 c->blockingkeysnum = 0;
ed9b544e 2184 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2185 listSetDupMethod(c->reply,dupClientReplyValue);
ed9b544e 2186 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2187 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2188 freeClient(c);
2189 return NULL;
2190 }
6b47e12e 2191 listAddNodeTail(server.clients,c);
6e469882 2192 initClientMultiState(c);
ed9b544e 2193 return c;
2194}
2195
2196static void addReply(redisClient *c, robj *obj) {
2197 if (listLength(c->reply) == 0 &&
6208b3a7 2198 (c->replstate == REDIS_REPL_NONE ||
2199 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2200 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2201 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2202
2203 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2204 obj = dupStringObject(obj);
2205 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2206 }
9d65a1bb 2207 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2208}
2209
2210static void addReplySds(redisClient *c, sds s) {
2211 robj *o = createObject(REDIS_STRING,s);
2212 addReply(c,o);
2213 decrRefCount(o);
2214}
2215
e2665397 2216static void addReplyDouble(redisClient *c, double d) {
2217 char buf[128];
2218
2219 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2220 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2221 (unsigned long) strlen(buf),buf));
e2665397 2222}
2223
942a3961 2224static void addReplyBulkLen(redisClient *c, robj *obj) {
2225 size_t len;
2226
2227 if (obj->encoding == REDIS_ENCODING_RAW) {
2228 len = sdslen(obj->ptr);
2229 } else {
2230 long n = (long)obj->ptr;
2231
e054afda 2232 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2233 len = 1;
2234 if (n < 0) {
2235 len++;
2236 n = -n;
2237 }
2238 while((n = n/10) != 0) {
2239 len++;
2240 }
2241 }
83c6a618 2242 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2243}
2244
ed9b544e 2245static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2246 int cport, cfd;
2247 char cip[128];
285add55 2248 redisClient *c;
ed9b544e 2249 REDIS_NOTUSED(el);
2250 REDIS_NOTUSED(mask);
2251 REDIS_NOTUSED(privdata);
2252
2253 cfd = anetAccept(server.neterr, fd, cip, &cport);
2254 if (cfd == AE_ERR) {
2255 redisLog(REDIS_DEBUG,"Accepting client connection: %s", server.neterr);
2256 return;
2257 }
2258 redisLog(REDIS_DEBUG,"Accepted %s:%d", cip, cport);
285add55 2259 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2260 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2261 close(cfd); /* May be already closed, just ingore errors */
2262 return;
2263 }
285add55 2264 /* If maxclient directive is set and this is one client more... close the
2265 * connection. Note that we create the client instead to check before
2266 * for this condition, since now the socket is already set in nonblocking
2267 * mode and we can send an error for free using the Kernel I/O */
2268 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2269 char *err = "-ERR max number of clients reached\r\n";
2270
2271 /* That's a best effort error message, don't check write errors */
fee803ba 2272 if (write(c->fd,err,strlen(err)) == -1) {
2273 /* Nothing to do, Just to avoid the warning... */
2274 }
285add55 2275 freeClient(c);
2276 return;
2277 }
ed9b544e 2278 server.stat_numconnections++;
2279}
2280
2281/* ======================= Redis objects implementation ===================== */
2282
2283static robj *createObject(int type, void *ptr) {
2284 robj *o;
2285
2286 if (listLength(server.objfreelist)) {
2287 listNode *head = listFirst(server.objfreelist);
2288 o = listNodeValue(head);
2289 listDelNode(server.objfreelist,head);
2290 } else {
75680a3c 2291 if (server.vm_enabled) {
2292 o = zmalloc(sizeof(*o));
2293 } else {
2294 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2295 }
ed9b544e 2296 }
ed9b544e 2297 o->type = type;
942a3961 2298 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2299 o->ptr = ptr;
2300 o->refcount = 1;
3a66edc7 2301 if (server.vm_enabled) {
2302 o->vm.atime = server.unixtime;
2303 o->storage = REDIS_VM_MEMORY;
2304 }
ed9b544e 2305 return o;
2306}
2307
2308static robj *createStringObject(char *ptr, size_t len) {
2309 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2310}
2311
4ef8de8a 2312static robj *dupStringObject(robj *o) {
2313 return createStringObject(o->ptr,sdslen(o->ptr));
2314}
2315
ed9b544e 2316static robj *createListObject(void) {
2317 list *l = listCreate();
2318
ed9b544e 2319 listSetFreeMethod(l,decrRefCount);
2320 return createObject(REDIS_LIST,l);
2321}
2322
2323static robj *createSetObject(void) {
2324 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2325 return createObject(REDIS_SET,d);
2326}
2327
1812e024 2328static robj *createZsetObject(void) {
6b47e12e 2329 zset *zs = zmalloc(sizeof(*zs));
2330
2331 zs->dict = dictCreate(&zsetDictType,NULL);
2332 zs->zsl = zslCreate();
2333 return createObject(REDIS_ZSET,zs);
1812e024 2334}
2335
ed9b544e 2336static void freeStringObject(robj *o) {
942a3961 2337 if (o->encoding == REDIS_ENCODING_RAW) {
2338 sdsfree(o->ptr);
2339 }
ed9b544e 2340}
2341
2342static void freeListObject(robj *o) {
2343 listRelease((list*) o->ptr);
2344}
2345
2346static void freeSetObject(robj *o) {
2347 dictRelease((dict*) o->ptr);
2348}
2349
fd8ccf44 2350static void freeZsetObject(robj *o) {
2351 zset *zs = o->ptr;
2352
2353 dictRelease(zs->dict);
2354 zslFree(zs->zsl);
2355 zfree(zs);
2356}
2357
ed9b544e 2358static void freeHashObject(robj *o) {
2359 dictRelease((dict*) o->ptr);
2360}
2361
2362static void incrRefCount(robj *o) {
a35ddf12 2363 assert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2364 o->refcount++;
2365}
2366
2367static void decrRefCount(void *obj) {
2368 robj *o = obj;
94754ccc 2369
a35ddf12 2370 /* REDIS_VM_SWAPPED */
2371 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPED) {
2372 assert(o->refcount == 1);
2373 assert(o->type == REDIS_STRING);
2374 freeStringObject(o);
2375 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2376 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2377 !listAddNodeHead(server.objfreelist,o))
2378 zfree(o);
2379 return;
2380 }
2381 /* REDIS_VM_MEMORY */
ed9b544e 2382 if (--(o->refcount) == 0) {
2383 switch(o->type) {
2384 case REDIS_STRING: freeStringObject(o); break;
2385 case REDIS_LIST: freeListObject(o); break;
2386 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2387 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2388 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2389 default: redisAssert(0 != 0); break;
ed9b544e 2390 }
2391 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2392 !listAddNodeHead(server.objfreelist,o))
2393 zfree(o);
2394 }
2395}
2396
942a3961 2397static robj *lookupKey(redisDb *db, robj *key) {
2398 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2399 if (de) {
55cf8433 2400 robj *key = dictGetEntryKey(de);
2401 robj *val = dictGetEntryVal(de);
3a66edc7 2402
55cf8433 2403 if (server.vm_enabled) {
2404 if (key->storage == REDIS_VM_MEMORY) {
2405 /* Update the access time of the key for the aging algorithm. */
2406 key->vm.atime = server.unixtime;
2407 } else {
2408 /* Our value was swapped on disk. Bring it at home. */
2409 assert(val == NULL);
2410 val = vmLoadObject(key);
2411 dictGetEntryVal(de) = val;
2412 }
2413 }
2414 return val;
3a66edc7 2415 } else {
2416 return NULL;
2417 }
942a3961 2418}
2419
2420static robj *lookupKeyRead(redisDb *db, robj *key) {
2421 expireIfNeeded(db,key);
2422 return lookupKey(db,key);
2423}
2424
2425static robj *lookupKeyWrite(redisDb *db, robj *key) {
2426 deleteIfVolatile(db,key);
2427 return lookupKey(db,key);
2428}
2429
2430static int deleteKey(redisDb *db, robj *key) {
2431 int retval;
2432
2433 /* We need to protect key from destruction: after the first dictDelete()
2434 * it may happen that 'key' is no longer valid if we don't increment
2435 * it's count. This may happen when we get the object reference directly
2436 * from the hash table with dictRandomKey() or dict iterators */
2437 incrRefCount(key);
2438 if (dictSize(db->expires)) dictDelete(db->expires,key);
2439 retval = dictDelete(db->dict,key);
2440 decrRefCount(key);
2441
2442 return retval == DICT_OK;
2443}
2444
10c43610 2445/* Try to share an object against the shared objects pool */
2446static robj *tryObjectSharing(robj *o) {
2447 struct dictEntry *de;
2448 unsigned long c;
2449
3305306f 2450 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2451
dfc5e96c 2452 redisAssert(o->type == REDIS_STRING);
10c43610 2453 de = dictFind(server.sharingpool,o);
2454 if (de) {
2455 robj *shared = dictGetEntryKey(de);
2456
2457 c = ((unsigned long) dictGetEntryVal(de))+1;
2458 dictGetEntryVal(de) = (void*) c;
2459 incrRefCount(shared);
2460 decrRefCount(o);
2461 return shared;
2462 } else {
2463 /* Here we are using a stream algorihtm: Every time an object is
2464 * shared we increment its count, everytime there is a miss we
2465 * recrement the counter of a random object. If this object reaches
2466 * zero we remove the object and put the current object instead. */
3305306f 2467 if (dictSize(server.sharingpool) >=
10c43610 2468 server.sharingpoolsize) {
2469 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2470 redisAssert(de != NULL);
10c43610 2471 c = ((unsigned long) dictGetEntryVal(de))-1;
2472 dictGetEntryVal(de) = (void*) c;
2473 if (c == 0) {
2474 dictDelete(server.sharingpool,de->key);
2475 }
2476 } else {
2477 c = 0; /* If the pool is empty we want to add this object */
2478 }
2479 if (c == 0) {
2480 int retval;
2481
2482 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2483 redisAssert(retval == DICT_OK);
10c43610 2484 incrRefCount(o);
2485 }
2486 return o;
2487 }
2488}
2489
724a51b1 2490/* Check if the nul-terminated string 's' can be represented by a long
2491 * (that is, is a number that fits into long without any other space or
2492 * character before or after the digits).
2493 *
2494 * If so, the function returns REDIS_OK and *longval is set to the value
2495 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2496static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2497 char buf[32], *endptr;
2498 long value;
2499 int slen;
2500
2501 value = strtol(s, &endptr, 10);
2502 if (endptr[0] != '\0') return REDIS_ERR;
2503 slen = snprintf(buf,32,"%ld",value);
2504
2505 /* If the number converted back into a string is not identical
2506 * then it's not possible to encode the string as integer */
f69f2cba 2507 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2508 if (longval) *longval = value;
2509 return REDIS_OK;
2510}
2511
942a3961 2512/* Try to encode a string object in order to save space */
2513static int tryObjectEncoding(robj *o) {
2514 long value;
942a3961 2515 sds s = o->ptr;
3305306f 2516
942a3961 2517 if (o->encoding != REDIS_ENCODING_RAW)
2518 return REDIS_ERR; /* Already encoded */
3305306f 2519
942a3961 2520 /* It's not save to encode shared objects: shared objects can be shared
2521 * everywhere in the "object space" of Redis. Encoded objects can only
2522 * appear as "values" (and not, for instance, as keys) */
2523 if (o->refcount > 1) return REDIS_ERR;
3305306f 2524
942a3961 2525 /* Currently we try to encode only strings */
dfc5e96c 2526 redisAssert(o->type == REDIS_STRING);
94754ccc 2527
724a51b1 2528 /* Check if we can represent this string as a long integer */
2529 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2530
2531 /* Ok, this object can be encoded */
2532 o->encoding = REDIS_ENCODING_INT;
2533 sdsfree(o->ptr);
2534 o->ptr = (void*) value;
2535 return REDIS_OK;
2536}
2537
9d65a1bb 2538/* Get a decoded version of an encoded object (returned as a new object).
2539 * If the object is already raw-encoded just increment the ref count. */
2540static robj *getDecodedObject(robj *o) {
942a3961 2541 robj *dec;
2542
9d65a1bb 2543 if (o->encoding == REDIS_ENCODING_RAW) {
2544 incrRefCount(o);
2545 return o;
2546 }
942a3961 2547 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2548 char buf[32];
2549
2550 snprintf(buf,32,"%ld",(long)o->ptr);
2551 dec = createStringObject(buf,strlen(buf));
2552 return dec;
2553 } else {
dfc5e96c 2554 redisAssert(1 != 1);
942a3961 2555 }
3305306f 2556}
2557
d7f43c08 2558/* Compare two string objects via strcmp() or alike.
2559 * Note that the objects may be integer-encoded. In such a case we
2560 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2561 * and compare the strings, it's much faster than calling getDecodedObject().
2562 *
2563 * Important note: if objects are not integer encoded, but binary-safe strings,
2564 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2565 * binary safe. */
724a51b1 2566static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2567 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2568 char bufa[128], bufb[128], *astr, *bstr;
2569 int bothsds = 1;
724a51b1 2570
e197b441 2571 if (a == b) return 0;
d7f43c08 2572 if (a->encoding != REDIS_ENCODING_RAW) {
2573 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2574 astr = bufa;
2575 bothsds = 0;
724a51b1 2576 } else {
d7f43c08 2577 astr = a->ptr;
724a51b1 2578 }
d7f43c08 2579 if (b->encoding != REDIS_ENCODING_RAW) {
2580 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2581 bstr = bufb;
2582 bothsds = 0;
2583 } else {
2584 bstr = b->ptr;
2585 }
2586 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2587}
2588
0ea663ea 2589static size_t stringObjectLen(robj *o) {
dfc5e96c 2590 redisAssert(o->type == REDIS_STRING);
0ea663ea 2591 if (o->encoding == REDIS_ENCODING_RAW) {
2592 return sdslen(o->ptr);
2593 } else {
2594 char buf[32];
2595
2596 return snprintf(buf,32,"%ld",(long)o->ptr);
2597 }
2598}
2599
06233c45 2600/*============================ RDB saving/loading =========================== */
ed9b544e 2601
/* Write the one-byte 'type' to 'fp'. Return 0 on success, -1 when fwrite()
 * reports that nothing was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2606
/* Write 't' to 'fp' as a 4-byte signed integer in host byte order (the
 * value is truncated to int32_t). Return 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,4,1,fp) == 0) ? -1 : 0;
}
2612
e3566d4b 2613/* check rdbLoadLen() comments for more info */
f78fd11b 2614static int rdbSaveLen(FILE *fp, uint32_t len) {
2615 unsigned char buf[2];
2616
2617 if (len < (1<<6)) {
2618 /* Save a 6 bit len */
10c43610 2619 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2620 if (fwrite(buf,1,1,fp) == 0) return -1;
2621 } else if (len < (1<<14)) {
2622 /* Save a 14 bit len */
10c43610 2623 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2624 buf[1] = len&0xFF;
17be1a4a 2625 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2626 } else {
2627 /* Save a 32 bit len */
10c43610 2628 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2629 if (fwrite(buf,1,1,fp) == 0) return -1;
2630 len = htonl(len);
2631 if (fwrite(&len,4,1,fp) == 0) return -1;
2632 }
2633 return 0;
2634}
2635
e3566d4b 2636/* String objects in the form "2391" "-100" without any space and with a
2637 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2638 * encoded as integers to save space */
56906eef 2639static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
e3566d4b 2640 long long value;
2641 char *endptr, buf[32];
2642
2643 /* Check if it's possible to encode this value as a number */
2644 value = strtoll(s, &endptr, 10);
2645 if (endptr[0] != '\0') return 0;
2646 snprintf(buf,32,"%lld",value);
2647
2648 /* If the number converted back into a string is not identical
2649 * then it's not possible to encode the string as integer */
2650 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2651
2652 /* Finally check if it fits in our ranges */
2653 if (value >= -(1<<7) && value <= (1<<7)-1) {
2654 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2655 enc[1] = value&0xFF;
2656 return 2;
2657 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2658 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2659 enc[1] = value&0xFF;
2660 enc[2] = (value>>8)&0xFF;
2661 return 3;
2662 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2663 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2664 enc[1] = value&0xFF;
2665 enc[2] = (value>>8)&0xFF;
2666 enc[3] = (value>>16)&0xFF;
2667 enc[4] = (value>>24)&0xFF;
2668 return 5;
2669 } else {
2670 return 0;
2671 }
2672}
2673
774e3047 2674static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2675 unsigned int comprlen, outlen;
2676 unsigned char byte;
2677 void *out;
2678
2679 /* We require at least four bytes compression for this to be worth it */
2680 outlen = sdslen(obj->ptr)-4;
2681 if (outlen <= 0) return 0;
3a2694c4 2682 if ((out = zmalloc(outlen+1)) == NULL) return 0;
774e3047 2683 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2684 if (comprlen == 0) {
88e85998 2685 zfree(out);
774e3047 2686 return 0;
2687 }
2688 /* Data compressed! Let's save it on disk */
2689 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2690 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2691 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2692 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2693 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 2694 zfree(out);
774e3047 2695 return comprlen;
2696
2697writeerr:
88e85998 2698 zfree(out);
774e3047 2699 return -1;
2700}
2701
e3566d4b 2702/* Save a string objet as [len][data] on disk. If the object is a string
2703 * representation of an integer value we try to safe it in a special form */
942a3961 2704static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2705 size_t len;
e3566d4b 2706 int enclen;
10c43610 2707
942a3961 2708 len = sdslen(obj->ptr);
2709
774e3047 2710 /* Try integer encoding */
e3566d4b 2711 if (len <= 11) {
2712 unsigned char buf[5];
2713 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2714 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2715 return 0;
2716 }
2717 }
774e3047 2718
2719 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 2720 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 2721 if (server.rdbcompression && len > 20) {
774e3047 2722 int retval;
2723
2724 retval = rdbSaveLzfStringObject(fp,obj);
2725 if (retval == -1) return -1;
2726 if (retval > 0) return 0;
2727 /* retval == 0 means data can't be compressed, save the old way */
2728 }
2729
2730 /* Store verbatim */
10c43610 2731 if (rdbSaveLen(fp,len) == -1) return -1;
2732 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2733 return 0;
2734}
2735
942a3961 2736/* Like rdbSaveStringObjectRaw() but handle encoded objects */
2737static int rdbSaveStringObject(FILE *fp, robj *obj) {
2738 int retval;
942a3961 2739
9d65a1bb 2740 obj = getDecodedObject(obj);
2741 retval = rdbSaveStringObjectRaw(fp,obj);
2742 decrRefCount(obj);
2743 return retval;
942a3961 2744}
2745
/* Save a double value. A double is stored as its "%.17g" string
 * representation prefixed by an unsigned 8 bit integer holding the length
 * of that string. Three values of the prefix byte are reserved and are not
 * followed by any string:
 *
 *   253: not a number
 *   254: + inf
 *   255: - inf
 *
 * Return 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
2772
06233c45 2773/* Save a Redis object. */
2774static int rdbSaveObject(FILE *fp, robj *o) {
2775 if (o->type == REDIS_STRING) {
2776 /* Save a string value */
2777 if (rdbSaveStringObject(fp,o) == -1) return -1;
2778 } else if (o->type == REDIS_LIST) {
2779 /* Save a list value */
2780 list *list = o->ptr;
2781 listNode *ln;
2782
2783 listRewind(list);
2784 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
2785 while((ln = listYield(list))) {
2786 robj *eleobj = listNodeValue(ln);
2787
2788 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2789 }
2790 } else if (o->type == REDIS_SET) {
2791 /* Save a set value */
2792 dict *set = o->ptr;
2793 dictIterator *di = dictGetIterator(set);
2794 dictEntry *de;
2795
2796 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
2797 while((de = dictNext(di)) != NULL) {
2798 robj *eleobj = dictGetEntryKey(de);
2799
2800 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2801 }
2802 dictReleaseIterator(di);
2803 } else if (o->type == REDIS_ZSET) {
2804 /* Save a set value */
2805 zset *zs = o->ptr;
2806 dictIterator *di = dictGetIterator(zs->dict);
2807 dictEntry *de;
2808
2809 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
2810 while((de = dictNext(di)) != NULL) {
2811 robj *eleobj = dictGetEntryKey(de);
2812 double *score = dictGetEntryVal(de);
2813
2814 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2815 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
2816 }
2817 dictReleaseIterator(di);
2818 } else {
2819 redisAssert(0 != 0);
2820 }
2821 return 0;
2822}
2823
2824/* Return the length the object will have on disk if saved with
2825 * the rdbSaveObject() function. Currently we use a trick to get
2826 * this length with very little changes to the code. In the future
2827 * we could switch to a faster solution. */
2828static off_t rdbSavedObjectLen(robj *o) {
2829 static FILE *fp = NULL;
2830
2831 if (fp == NULL) fp = fopen("/dev/null","w");
2832 assert(fp != NULL);
2833
2834 rewind(fp);
2835 assert(rdbSaveObject(fp,o) != 1);
2836 return ftello(fp);
2837}
2838
06224fec 2839/* Return the number of pages required to save this object in the swap file */
2840static off_t rdbSavedObjectPages(robj *o) {
2841 off_t bytes = rdbSavedObjectLen(o);
2842
2843 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
2844}
2845
ed9b544e 2846/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 2847static int rdbSave(char *filename) {
ed9b544e 2848 dictIterator *di = NULL;
2849 dictEntry *de;
ed9b544e 2850 FILE *fp;
2851 char tmpfile[256];
2852 int j;
bb32ede5 2853 time_t now = time(NULL);
ed9b544e 2854
a3b21203 2855 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 2856 fp = fopen(tmpfile,"w");
2857 if (!fp) {
2858 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
2859 return REDIS_ERR;
2860 }
f78fd11b 2861 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 2862 for (j = 0; j < server.dbnum; j++) {
bb32ede5 2863 redisDb *db = server.db+j;
2864 dict *d = db->dict;
3305306f 2865 if (dictSize(d) == 0) continue;
ed9b544e 2866 di = dictGetIterator(d);
2867 if (!di) {
2868 fclose(fp);
2869 return REDIS_ERR;
2870 }
2871
2872 /* Write the SELECT DB opcode */
f78fd11b 2873 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
2874 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 2875
2876 /* Iterate this DB writing every entry */
2877 while((de = dictNext(di)) != NULL) {
2878 robj *key = dictGetEntryKey(de);
2879 robj *o = dictGetEntryVal(de);
bb32ede5 2880 time_t expiretime = getExpire(db,key);
2881
2882 /* Save the expire time */
2883 if (expiretime != -1) {
2884 /* If this key is already expired skip it */
2885 if (expiretime < now) continue;
2886 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
2887 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
2888 }
7e69548d 2889 /* Save the key and associated value. This requires special
2890 * handling if the value is swapped out. */
2891 if (key->storage == REDIS_VM_MEMORY) {
2892 /* Save type, key, value */
2893 if (rdbSaveType(fp,o->type) == -1) goto werr;
2894 if (rdbSaveStringObject(fp,key) == -1) goto werr;
2895 if (rdbSaveObject(fp,o) == -1) goto werr;
2896 } else {
2897 robj *po, *newkey;
2898 /* Get a preview of the object in memory */
2899 po = vmPreviewObject(key);
2900 /* Also duplicate the key object, to pass around a standard
2901 * string object. */
2902 newkey = dupStringObject(key);
2903 /* Save type, key, value */
2904 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
2905 if (rdbSaveStringObject(fp,newkey) == -1) goto werr;
2906 if (rdbSaveObject(fp,po) == -1) goto werr;
2907 /* Remove the loaded object from memory */
2908 decrRefCount(po);
2909 decrRefCount(newkey);
2910 }
ed9b544e 2911 }
2912 dictReleaseIterator(di);
2913 }
2914 /* EOF opcode */
f78fd11b 2915 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
2916
2917 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 2918 fflush(fp);
2919 fsync(fileno(fp));
2920 fclose(fp);
2921
2922 /* Use RENAME to make sure the DB file is changed atomically only
2923 * if the generate DB file is ok. */
2924 if (rename(tmpfile,filename) == -1) {
325d1eb4 2925 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 2926 unlink(tmpfile);
2927 return REDIS_ERR;
2928 }
2929 redisLog(REDIS_NOTICE,"DB saved on disk");
2930 server.dirty = 0;
2931 server.lastsave = time(NULL);
2932 return REDIS_OK;
2933
2934werr:
2935 fclose(fp);
2936 unlink(tmpfile);
2937 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
2938 if (di) dictReleaseIterator(di);
2939 return REDIS_ERR;
2940}
2941
f78fd11b 2942static int rdbSaveBackground(char *filename) {
ed9b544e 2943 pid_t childpid;
2944
9d65a1bb 2945 if (server.bgsavechildpid != -1) return REDIS_ERR;
ed9b544e 2946 if ((childpid = fork()) == 0) {
2947 /* Child */
2948 close(server.fd);
f78fd11b 2949 if (rdbSave(filename) == REDIS_OK) {
ed9b544e 2950 exit(0);
2951 } else {
2952 exit(1);
2953 }
2954 } else {
2955 /* Parent */
5a7c647e 2956 if (childpid == -1) {
2957 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
2958 strerror(errno));
2959 return REDIS_ERR;
2960 }
ed9b544e 2961 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 2962 server.bgsavechildpid = childpid;
ed9b544e 2963 return REDIS_OK;
2964 }
2965 return REDIS_OK; /* unreached */
2966}
2967
/* Remove the temporary RDB file "temp-<pid>.rdb" belonging to the process
 * 'childpid'. The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
2974
/* Read a one-byte type from 'fp'. Return the byte (0-255), or -1 if it
 * could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
2980
/* Read a time stored as a 4-byte signed integer in host byte order.
 * Return the value as time_t, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    if (fread(&secs,4,1,fp) != 0) return (time_t) secs;
    return -1;
}
2986
e3566d4b 2987/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
2988 * of this file for a description of how this are stored on disk.
2989 *
2990 * isencoded is set to 1 if the readed length is not actually a length but
2991 * an "encoding type", check the above comments for more info */
c78a8ccc 2992static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 2993 unsigned char buf[2];
2994 uint32_t len;
c78a8ccc 2995 int type;
f78fd11b 2996
e3566d4b 2997 if (isencoded) *isencoded = 0;
c78a8ccc 2998 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
2999 type = (buf[0]&0xC0)>>6;
3000 if (type == REDIS_RDB_6BITLEN) {
3001 /* Read a 6 bit len */
3002 return buf[0]&0x3F;
3003 } else if (type == REDIS_RDB_ENCVAL) {
3004 /* Read a 6 bit len encoding type */
3005 if (isencoded) *isencoded = 1;
3006 return buf[0]&0x3F;
3007 } else if (type == REDIS_RDB_14BITLEN) {
3008 /* Read a 14 bit len */
3009 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3010 return ((buf[0]&0x3F)<<8)|buf[1];
3011 } else {
3012 /* Read a 32 bit len */
f78fd11b 3013 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3014 return ntohl(len);
f78fd11b 3015 }
f78fd11b 3016}
3017
e3566d4b 3018static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3019 unsigned char enc[4];
3020 long long val;
3021
3022 if (enctype == REDIS_RDB_ENC_INT8) {
3023 if (fread(enc,1,1,fp) == 0) return NULL;
3024 val = (signed char)enc[0];
3025 } else if (enctype == REDIS_RDB_ENC_INT16) {
3026 uint16_t v;
3027 if (fread(enc,2,1,fp) == 0) return NULL;
3028 v = enc[0]|(enc[1]<<8);
3029 val = (int16_t)v;
3030 } else if (enctype == REDIS_RDB_ENC_INT32) {
3031 uint32_t v;
3032 if (fread(enc,4,1,fp) == 0) return NULL;
3033 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3034 val = (int32_t)v;
3035 } else {
3036 val = 0; /* anti-warning */
dfc5e96c 3037 redisAssert(0!=0);
e3566d4b 3038 }
3039 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3040}
3041
c78a8ccc 3042static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3043 unsigned int len, clen;
3044 unsigned char *c = NULL;
3045 sds val = NULL;
3046
c78a8ccc 3047 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3048 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3049 if ((c = zmalloc(clen)) == NULL) goto err;
3050 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3051 if (fread(c,clen,1,fp) == 0) goto err;
3052 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3053 zfree(c);
88e85998 3054 return createObject(REDIS_STRING,val);
3055err:
3056 zfree(c);
3057 sdsfree(val);
3058 return NULL;
3059}
3060
c78a8ccc 3061static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3062 int isencoded;
3063 uint32_t len;
f78fd11b 3064 sds val;
3065
c78a8ccc 3066 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3067 if (isencoded) {
3068 switch(len) {
3069 case REDIS_RDB_ENC_INT8:
3070 case REDIS_RDB_ENC_INT16:
3071 case REDIS_RDB_ENC_INT32:
3305306f 3072 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3073 case REDIS_RDB_ENC_LZF:
c78a8ccc 3074 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3075 default:
dfc5e96c 3076 redisAssert(0!=0);
e3566d4b 3077 }
3078 }
3079
f78fd11b 3080 if (len == REDIS_RDB_LENERR) return NULL;
3081 val = sdsnewlen(NULL,len);
3082 if (len && fread(val,len,1,fp) == 0) {
3083 sdsfree(val);
3084 return NULL;
3085 }
10c43610 3086 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3087}
3088
a7866db6 3089/* For information about double serialization check rdbSaveDoubleValue() */
3090static int rdbLoadDoubleValue(FILE *fp, double *val) {
3091 char buf[128];
3092 unsigned char len;
3093
3094 if (fread(&len,1,1,fp) == 0) return -1;
3095 switch(len) {
3096 case 255: *val = R_NegInf; return 0;
3097 case 254: *val = R_PosInf; return 0;
3098 case 253: *val = R_Nan; return 0;
3099 default:
3100 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3101 buf[len] = '\0';
a7866db6 3102 sscanf(buf, "%lg", val);
3103 return 0;
3104 }
3105}
3106
c78a8ccc 3107/* Load a Redis object of the specified type from the specified file.
3108 * On success a newly allocated object is returned, otherwise NULL. */
3109static robj *rdbLoadObject(int type, FILE *fp) {
3110 robj *o;
3111
3112 if (type == REDIS_STRING) {
3113 /* Read string value */
3114 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3115 tryObjectEncoding(o);
3116 } else if (type == REDIS_LIST || type == REDIS_SET) {
3117 /* Read list/set value */
3118 uint32_t listlen;
3119
3120 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3121 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3122 /* Load every single element of the list/set */
3123 while(listlen--) {
3124 robj *ele;
3125
3126 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3127 tryObjectEncoding(ele);
3128 if (type == REDIS_LIST) {
3129 listAddNodeTail((list*)o->ptr,ele);
3130 } else {
3131 dictAdd((dict*)o->ptr,ele,NULL);
3132 }
3133 }
3134 } else if (type == REDIS_ZSET) {
3135 /* Read list/set value */
3136 uint32_t zsetlen;
3137 zset *zs;
3138
3139 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3140 o = createZsetObject();
3141 zs = o->ptr;
3142 /* Load every single element of the list/set */
3143 while(zsetlen--) {
3144 robj *ele;
3145 double *score = zmalloc(sizeof(double));
3146
3147 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3148 tryObjectEncoding(ele);
3149 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3150 dictAdd(zs->dict,ele,score);
3151 zslInsert(zs->zsl,*score,ele);
3152 incrRefCount(ele); /* added to skiplist */
3153 }
3154 } else {
3155 redisAssert(0 != 0);
3156 }
3157 return o;
3158}
3159
f78fd11b 3160static int rdbLoad(char *filename) {
ed9b544e 3161 FILE *fp;
f78fd11b 3162 robj *keyobj = NULL;
3163 uint32_t dbid;
bb32ede5 3164 int type, retval, rdbver;
3305306f 3165 dict *d = server.db[0].dict;
bb32ede5 3166 redisDb *db = server.db+0;
f78fd11b 3167 char buf[1024];
bb32ede5 3168 time_t expiretime = -1, now = time(NULL);
3169
ed9b544e 3170 fp = fopen(filename,"r");
3171 if (!fp) return REDIS_ERR;
3172 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3173 buf[9] = '\0';
3174 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3175 fclose(fp);
3176 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3177 return REDIS_ERR;
3178 }
f78fd11b 3179 rdbver = atoi(buf+5);
c78a8ccc 3180 if (rdbver != 1) {
f78fd11b 3181 fclose(fp);
3182 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3183 return REDIS_ERR;
3184 }
ed9b544e 3185 while(1) {
3186 robj *o;
3187
3188 /* Read type. */
f78fd11b 3189 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3190 if (type == REDIS_EXPIRETIME) {
3191 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3192 /* We read the time so we need to read the object type again */
3193 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3194 }
ed9b544e 3195 if (type == REDIS_EOF) break;
3196 /* Handle SELECT DB opcode as a special case */
3197 if (type == REDIS_SELECTDB) {
c78a8ccc 3198 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3199 goto eoferr;
ed9b544e 3200 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3201 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3202 exit(1);
3203 }
bb32ede5 3204 db = server.db+dbid;
3205 d = db->dict;
ed9b544e 3206 continue;
3207 }
3208 /* Read key */
c78a8ccc 3209 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3210 /* Read value */
3211 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3212 /* Add the new object in the hash table */
f78fd11b 3213 retval = dictAdd(d,keyobj,o);
ed9b544e 3214 if (retval == DICT_ERR) {
f78fd11b 3215 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3216 exit(1);
3217 }
bb32ede5 3218 /* Set the expire time if needed */
3219 if (expiretime != -1) {
3220 setExpire(db,keyobj,expiretime);
3221 /* Delete this key if already expired */
3222 if (expiretime < now) deleteKey(db,keyobj);
3223 expiretime = -1;
3224 }
f78fd11b 3225 keyobj = o = NULL;
ed9b544e 3226 }
3227 fclose(fp);
3228 return REDIS_OK;
3229
3230eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3231 if (keyobj) decrRefCount(keyobj);
f80dff62 3232 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3233 exit(1);
3234 return REDIS_ERR; /* Just to avoid warning */
3235}
3236
3237/*================================== Commands =============================== */
3238
abcb223e 3239static void authCommand(redisClient *c) {
2e77c2ee 3240 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3241 c->authenticated = 1;
3242 addReply(c,shared.ok);
3243 } else {
3244 c->authenticated = 0;
fa4c0aba 3245 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3246 }
3247}
3248
ed9b544e 3249static void pingCommand(redisClient *c) {
3250 addReply(c,shared.pong);
3251}
3252
3253static void echoCommand(redisClient *c) {
942a3961 3254 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3255 addReply(c,c->argv[1]);
3256 addReply(c,shared.crlf);
3257}
3258
3259/*=================================== Strings =============================== */
3260
3261static void setGenericCommand(redisClient *c, int nx) {
3262 int retval;
3263
333fd216 3264 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3265 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3266 if (retval == DICT_ERR) {
3267 if (!nx) {
1b03836c 3268 /* If the key is about a swapped value, we want a new key object
3269 * to overwrite the old. So we delete the old key in the database.
3270 * This will also make sure that swap pages about the old object
3271 * will be marked as free. */
3272 if (deleteIfSwapped(c->db,c->argv[1]))
3273 incrRefCount(c->argv[1]);
3305306f 3274 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3275 incrRefCount(c->argv[2]);
3276 } else {
c937aa89 3277 addReply(c,shared.czero);
ed9b544e 3278 return;
3279 }
3280 } else {
3281 incrRefCount(c->argv[1]);
3282 incrRefCount(c->argv[2]);
3283 }
3284 server.dirty++;
3305306f 3285 removeExpire(c->db,c->argv[1]);
c937aa89 3286 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3287}
3288
3289static void setCommand(redisClient *c) {
a4d1ba9a 3290 setGenericCommand(c,0);
ed9b544e 3291}
3292
3293static void setnxCommand(redisClient *c) {
a4d1ba9a 3294 setGenericCommand(c,1);
ed9b544e 3295}
3296
322fc7d8 3297static int getGenericCommand(redisClient *c) {
3305306f 3298 robj *o = lookupKeyRead(c->db,c->argv[1]);
3299
3300 if (o == NULL) {
c937aa89 3301 addReply(c,shared.nullbulk);
322fc7d8 3302 return REDIS_OK;
ed9b544e 3303 } else {
ed9b544e 3304 if (o->type != REDIS_STRING) {
c937aa89 3305 addReply(c,shared.wrongtypeerr);
322fc7d8 3306 return REDIS_ERR;
ed9b544e 3307 } else {
942a3961 3308 addReplyBulkLen(c,o);
ed9b544e 3309 addReply(c,o);
3310 addReply(c,shared.crlf);
322fc7d8 3311 return REDIS_OK;
ed9b544e 3312 }
3313 }
3314}
3315
322fc7d8 3316static void getCommand(redisClient *c) {
3317 getGenericCommand(c);
3318}
3319
f6b141c5 3320static void getsetCommand(redisClient *c) {
322fc7d8 3321 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3322 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3323 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3324 } else {
3325 incrRefCount(c->argv[1]);
3326 }
3327 incrRefCount(c->argv[2]);
3328 server.dirty++;
3329 removeExpire(c->db,c->argv[1]);
3330}
3331
70003d28 3332static void mgetCommand(redisClient *c) {
70003d28 3333 int j;
3334
c937aa89 3335 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3336 for (j = 1; j < c->argc; j++) {
3305306f 3337 robj *o = lookupKeyRead(c->db,c->argv[j]);
3338 if (o == NULL) {
c937aa89 3339 addReply(c,shared.nullbulk);
70003d28 3340 } else {
70003d28 3341 if (o->type != REDIS_STRING) {
c937aa89 3342 addReply(c,shared.nullbulk);
70003d28 3343 } else {
942a3961 3344 addReplyBulkLen(c,o);
70003d28 3345 addReply(c,o);
3346 addReply(c,shared.crlf);
3347 }
3348 }
3349 }
3350}
3351
6c446631 3352static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3353 int j, busykeys = 0;
6c446631 3354
3355 if ((c->argc % 2) == 0) {
454d4e43 3356 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3357 return;
3358 }
3359 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3360 * set nothing at all if at least one already key exists. */
3361 if (nx) {
3362 for (j = 1; j < c->argc; j += 2) {
906573e7 3363 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3364 busykeys++;
6c446631 3365 }
3366 }
3367 }
906573e7 3368 if (busykeys) {
3369 addReply(c, shared.czero);
3370 return;
3371 }
6c446631 3372
3373 for (j = 1; j < c->argc; j += 2) {
3374 int retval;
3375
17511391 3376 tryObjectEncoding(c->argv[j+1]);
6c446631 3377 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3378 if (retval == DICT_ERR) {
3379 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3380 incrRefCount(c->argv[j+1]);
3381 } else {
3382 incrRefCount(c->argv[j]);
3383 incrRefCount(c->argv[j+1]);
3384 }
3385 removeExpire(c->db,c->argv[j]);
3386 }
3387 server.dirty += (c->argc-1)/2;
3388 addReply(c, nx ? shared.cone : shared.ok);
3389}
3390
3391static void msetCommand(redisClient *c) {
3392 msetGenericCommand(c,0);
3393}
3394
3395static void msetnxCommand(redisClient *c) {
3396 msetGenericCommand(c,1);
3397}
3398
d68ed120 3399static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3400 long long value;
3401 int retval;
3402 robj *o;
3403
3305306f 3404 o = lookupKeyWrite(c->db,c->argv[1]);
3405 if (o == NULL) {
ed9b544e 3406 value = 0;
3407 } else {
ed9b544e 3408 if (o->type != REDIS_STRING) {
3409 value = 0;
3410 } else {
3411 char *eptr;
3412
942a3961 3413 if (o->encoding == REDIS_ENCODING_RAW)
3414 value = strtoll(o->ptr, &eptr, 10);
3415 else if (o->encoding == REDIS_ENCODING_INT)
3416 value = (long)o->ptr;
3417 else
dfc5e96c 3418 redisAssert(1 != 1);
ed9b544e 3419 }
3420 }
3421
3422 value += incr;
3423 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3424 tryObjectEncoding(o);
3305306f 3425 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3426 if (retval == DICT_ERR) {
3305306f 3427 dictReplace(c->db->dict,c->argv[1],o);
3428 removeExpire(c->db,c->argv[1]);
ed9b544e 3429 } else {
3430 incrRefCount(c->argv[1]);
3431 }
3432 server.dirty++;
c937aa89 3433 addReply(c,shared.colon);
ed9b544e 3434 addReply(c,o);
3435 addReply(c,shared.crlf);
3436}
3437
3438static void incrCommand(redisClient *c) {
a4d1ba9a 3439 incrDecrCommand(c,1);
ed9b544e 3440}
3441
3442static void decrCommand(redisClient *c) {
a4d1ba9a 3443 incrDecrCommand(c,-1);
ed9b544e 3444}
3445
3446static void incrbyCommand(redisClient *c) {
d68ed120 3447 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3448 incrDecrCommand(c,incr);
ed9b544e 3449}
3450
3451static void decrbyCommand(redisClient *c) {
d68ed120 3452 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3453 incrDecrCommand(c,-incr);
ed9b544e 3454}
3455
3456/* ========================= Type agnostic commands ========================= */
3457
3458static void delCommand(redisClient *c) {
5109cdff 3459 int deleted = 0, j;
3460
3461 for (j = 1; j < c->argc; j++) {
3462 if (deleteKey(c->db,c->argv[j])) {
3463 server.dirty++;
3464 deleted++;
3465 }
3466 }
3467 switch(deleted) {
3468 case 0:
c937aa89 3469 addReply(c,shared.czero);
5109cdff 3470 break;
3471 case 1:
3472 addReply(c,shared.cone);
3473 break;
3474 default:
3475 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3476 break;
ed9b544e 3477 }
3478}
3479
3480static void existsCommand(redisClient *c) {
3305306f 3481 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 3482}
3483
3484static void selectCommand(redisClient *c) {
3485 int id = atoi(c->argv[1]->ptr);
3486
3487 if (selectDb(c,id) == REDIS_ERR) {
774e3047 3488 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 3489 } else {
3490 addReply(c,shared.ok);
3491 }
3492}
3493
3494static void randomkeyCommand(redisClient *c) {
3495 dictEntry *de;
3305306f 3496
3497 while(1) {
3498 de = dictGetRandomKey(c->db->dict);
ce7bef07 3499 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 3500 }
ed9b544e 3501 if (de == NULL) {
ce7bef07 3502 addReply(c,shared.plus);
ed9b544e 3503 addReply(c,shared.crlf);
3504 } else {
c937aa89 3505 addReply(c,shared.plus);
ed9b544e 3506 addReply(c,dictGetEntryKey(de));
3507 addReply(c,shared.crlf);
3508 }
3509}
3510
3511static void keysCommand(redisClient *c) {
3512 dictIterator *di;
3513 dictEntry *de;
3514 sds pattern = c->argv[1]->ptr;
3515 int plen = sdslen(pattern);
682ac724 3516 unsigned long numkeys = 0, keyslen = 0;
ed9b544e 3517 robj *lenobj = createObject(REDIS_STRING,NULL);
3518
3305306f 3519 di = dictGetIterator(c->db->dict);
ed9b544e 3520 addReply(c,lenobj);
3521 decrRefCount(lenobj);
3522 while((de = dictNext(di)) != NULL) {
3523 robj *keyobj = dictGetEntryKey(de);
3305306f 3524
ed9b544e 3525 sds key = keyobj->ptr;
3526 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3527 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 3528 if (expireIfNeeded(c->db,keyobj) == 0) {
3529 if (numkeys != 0)
3530 addReply(c,shared.space);
3531 addReply(c,keyobj);
3532 numkeys++;
3533 keyslen += sdslen(key);
3534 }
ed9b544e 3535 }
3536 }
3537 dictReleaseIterator(di);
c937aa89 3538 lenobj->ptr = sdscatprintf(sdsempty(),"$%lu\r\n",keyslen+(numkeys ? (numkeys-1) : 0));
ed9b544e 3539 addReply(c,shared.crlf);
3540}
3541
3542static void dbsizeCommand(redisClient *c) {
3543 addReplySds(c,
3305306f 3544 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 3545}
3546
3547static void lastsaveCommand(redisClient *c) {
3548 addReplySds(c,
c937aa89 3549 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 3550}
3551
3552static void typeCommand(redisClient *c) {
3305306f 3553 robj *o;
ed9b544e 3554 char *type;
3305306f 3555
3556 o = lookupKeyRead(c->db,c->argv[1]);
3557 if (o == NULL) {
c937aa89 3558 type = "+none";
ed9b544e 3559 } else {
ed9b544e 3560 switch(o->type) {
c937aa89 3561 case REDIS_STRING: type = "+string"; break;
3562 case REDIS_LIST: type = "+list"; break;
3563 case REDIS_SET: type = "+set"; break;
412a8bce 3564 case REDIS_ZSET: type = "+zset"; break;
ed9b544e 3565 default: type = "unknown"; break;
3566 }
3567 }
3568 addReplySds(c,sdsnew(type));
3569 addReply(c,shared.crlf);
3570}
3571
3572static void saveCommand(redisClient *c) {
9d65a1bb 3573 if (server.bgsavechildpid != -1) {
05557f6d 3574 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3575 return;
3576 }
f78fd11b 3577 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 3578 addReply(c,shared.ok);
3579 } else {
3580 addReply(c,shared.err);
3581 }
3582}
3583
3584static void bgsaveCommand(redisClient *c) {
9d65a1bb 3585 if (server.bgsavechildpid != -1) {
ed9b544e 3586 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3587 return;
3588 }
f78fd11b 3589 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 3590 char *status = "+Background saving started\r\n";
3591 addReplySds(c,sdsnew(status));
ed9b544e 3592 } else {
3593 addReply(c,shared.err);
3594 }
3595}
3596
3597static void shutdownCommand(redisClient *c) {
3598 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 3599 /* Kill the saving child if there is a background saving in progress.
3600 We want to avoid race conditions, for instance our saving child may
3601 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 3602 if (server.bgsavechildpid != -1) {
9f3c422c 3603 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3604 kill(server.bgsavechildpid,SIGKILL);
a3b21203 3605 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 3606 }
ac945e2d 3607 if (server.appendonly) {
3608 /* Append only file: fsync() the AOF and exit */
3609 fsync(server.appendfd);
3610 exit(0);
ed9b544e 3611 } else {
ac945e2d 3612 /* Snapshotting. Perform a SYNC SAVE and exit */
3613 if (rdbSave(server.dbfilename) == REDIS_OK) {
3614 if (server.daemonize)
3615 unlink(server.pidfile);
3616 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3617 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3618 exit(0);
3619 } else {
3620 /* Ooops.. error saving! The best we can do is to continue operating.
3621 * Note that if there was a background saving process, in the next
3622 * cron() Redis will be notified that the background saving aborted,
3623 * handling special stuff like slaves pending for synchronization... */
3624 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3625 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3626 }
ed9b544e 3627 }
3628}
3629
3630static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 3631 robj *o;
3632
3633 /* To use the same key as src and dst is probably an error */
3634 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 3635 addReply(c,shared.sameobjecterr);
ed9b544e 3636 return;
3637 }
3638
3305306f 3639 o = lookupKeyWrite(c->db,c->argv[1]);
3640 if (o == NULL) {
c937aa89 3641 addReply(c,shared.nokeyerr);
ed9b544e 3642 return;
3643 }
ed9b544e 3644 incrRefCount(o);
3305306f 3645 deleteIfVolatile(c->db,c->argv[2]);
3646 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 3647 if (nx) {
3648 decrRefCount(o);
c937aa89 3649 addReply(c,shared.czero);
ed9b544e 3650 return;
3651 }
3305306f 3652 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 3653 } else {
3654 incrRefCount(c->argv[2]);
3655 }
3305306f 3656 deleteKey(c->db,c->argv[1]);
ed9b544e 3657 server.dirty++;
c937aa89 3658 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 3659}
3660
3661static void renameCommand(redisClient *c) {
3662 renameGenericCommand(c,0);
3663}
3664
3665static void renamenxCommand(redisClient *c) {
3666 renameGenericCommand(c,1);
3667}
3668
3669static void moveCommand(redisClient *c) {
3305306f 3670 robj *o;
3671 redisDb *src, *dst;
ed9b544e 3672 int srcid;
3673
3674 /* Obtain source and target DB pointers */
3305306f 3675 src = c->db;
3676 srcid = c->db->id;
ed9b544e 3677 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 3678 addReply(c,shared.outofrangeerr);
ed9b544e 3679 return;
3680 }
3305306f 3681 dst = c->db;
3682 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 3683
3684 /* If the user is moving using as target the same
3685 * DB as the source DB it is probably an error. */
3686 if (src == dst) {
c937aa89 3687 addReply(c,shared.sameobjecterr);
ed9b544e 3688 return;
3689 }
3690
3691 /* Check if the element exists and get a reference */
3305306f 3692 o = lookupKeyWrite(c->db,c->argv[1]);
3693 if (!o) {
c937aa89 3694 addReply(c,shared.czero);
ed9b544e 3695 return;
3696 }
3697
3698 /* Try to add the element to the target DB */
3305306f 3699 deleteIfVolatile(dst,c->argv[1]);
3700 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 3701 addReply(c,shared.czero);
ed9b544e 3702 return;
3703 }
3305306f 3704 incrRefCount(c->argv[1]);
ed9b544e 3705 incrRefCount(o);
3706
3707 /* OK! key moved, free the entry in the source DB */
3305306f 3708 deleteKey(src,c->argv[1]);
ed9b544e 3709 server.dirty++;
c937aa89 3710 addReply(c,shared.cone);
ed9b544e 3711}
3712
3713/* =================================== Lists ================================ */
3714static void pushGenericCommand(redisClient *c, int where) {
3715 robj *lobj;
ed9b544e 3716 list *list;
3305306f 3717
3718 lobj = lookupKeyWrite(c->db,c->argv[1]);
3719 if (lobj == NULL) {
95242ab5 3720 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3721 addReply(c,shared.ok);
3722 return;
3723 }
ed9b544e 3724 lobj = createListObject();
3725 list = lobj->ptr;
3726 if (where == REDIS_HEAD) {
6b47e12e 3727 listAddNodeHead(list,c->argv[2]);
ed9b544e 3728 } else {
6b47e12e 3729 listAddNodeTail(list,c->argv[2]);
ed9b544e 3730 }
3305306f 3731 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 3732 incrRefCount(c->argv[1]);
3733 incrRefCount(c->argv[2]);
3734 } else {
ed9b544e 3735 if (lobj->type != REDIS_LIST) {
3736 addReply(c,shared.wrongtypeerr);
3737 return;
3738 }
95242ab5 3739 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3740 addReply(c,shared.ok);
3741 return;
3742 }
ed9b544e 3743 list = lobj->ptr;
3744 if (where == REDIS_HEAD) {
6b47e12e 3745 listAddNodeHead(list,c->argv[2]);
ed9b544e 3746 } else {
6b47e12e 3747 listAddNodeTail(list,c->argv[2]);
ed9b544e 3748 }
3749 incrRefCount(c->argv[2]);
3750 }
3751 server.dirty++;
3752 addReply(c,shared.ok);
3753}
3754
3755static void lpushCommand(redisClient *c) {
3756 pushGenericCommand(c,REDIS_HEAD);
3757}
3758
3759static void rpushCommand(redisClient *c) {
3760 pushGenericCommand(c,REDIS_TAIL);
3761}
3762
3763static void llenCommand(redisClient *c) {
3305306f 3764 robj *o;
ed9b544e 3765 list *l;
3766
3305306f 3767 o = lookupKeyRead(c->db,c->argv[1]);
3768 if (o == NULL) {
c937aa89 3769 addReply(c,shared.czero);
ed9b544e 3770 return;
3771 } else {
ed9b544e 3772 if (o->type != REDIS_LIST) {
c937aa89 3773 addReply(c,shared.wrongtypeerr);
ed9b544e 3774 } else {
3775 l = o->ptr;
c937aa89 3776 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 3777 }
3778 }
3779}
3780
3781static void lindexCommand(redisClient *c) {
3305306f 3782 robj *o;
ed9b544e 3783 int index = atoi(c->argv[2]->ptr);
3784
3305306f 3785 o = lookupKeyRead(c->db,c->argv[1]);
3786 if (o == NULL) {
c937aa89 3787 addReply(c,shared.nullbulk);
ed9b544e 3788 } else {
ed9b544e 3789 if (o->type != REDIS_LIST) {
c937aa89 3790 addReply(c,shared.wrongtypeerr);
ed9b544e 3791 } else {
3792 list *list = o->ptr;
3793 listNode *ln;
3794
3795 ln = listIndex(list, index);
3796 if (ln == NULL) {
c937aa89 3797 addReply(c,shared.nullbulk);
ed9b544e 3798 } else {
3799 robj *ele = listNodeValue(ln);
942a3961 3800 addReplyBulkLen(c,ele);
ed9b544e 3801 addReply(c,ele);
3802 addReply(c,shared.crlf);
3803 }
3804 }
3805 }
3806}
3807
3808static void lsetCommand(redisClient *c) {
3305306f 3809 robj *o;
ed9b544e 3810 int index = atoi(c->argv[2]->ptr);
3811
3305306f 3812 o = lookupKeyWrite(c->db,c->argv[1]);
3813 if (o == NULL) {
ed9b544e 3814 addReply(c,shared.nokeyerr);
3815 } else {
ed9b544e 3816 if (o->type != REDIS_LIST) {
3817 addReply(c,shared.wrongtypeerr);
3818 } else {
3819 list *list = o->ptr;
3820 listNode *ln;
3821
3822 ln = listIndex(list, index);
3823 if (ln == NULL) {
c937aa89 3824 addReply(c,shared.outofrangeerr);
ed9b544e 3825 } else {
3826 robj *ele = listNodeValue(ln);
3827
3828 decrRefCount(ele);
3829 listNodeValue(ln) = c->argv[3];
3830 incrRefCount(c->argv[3]);
3831 addReply(c,shared.ok);
3832 server.dirty++;
3833 }
3834 }
3835 }
3836}
3837
3838static void popGenericCommand(redisClient *c, int where) {
3305306f 3839 robj *o;
3840
3841 o = lookupKeyWrite(c->db,c->argv[1]);
3842 if (o == NULL) {
c937aa89 3843 addReply(c,shared.nullbulk);
ed9b544e 3844 } else {
ed9b544e 3845 if (o->type != REDIS_LIST) {
c937aa89 3846 addReply(c,shared.wrongtypeerr);
ed9b544e 3847 } else {
3848 list *list = o->ptr;
3849 listNode *ln;
3850
3851 if (where == REDIS_HEAD)
3852 ln = listFirst(list);
3853 else
3854 ln = listLast(list);
3855
3856 if (ln == NULL) {
c937aa89 3857 addReply(c,shared.nullbulk);
ed9b544e 3858 } else {
3859 robj *ele = listNodeValue(ln);
942a3961 3860 addReplyBulkLen(c,ele);
ed9b544e 3861 addReply(c,ele);
3862 addReply(c,shared.crlf);
3863 listDelNode(list,ln);
3864 server.dirty++;
3865 }
3866 }
3867 }
3868}
3869
3870static void lpopCommand(redisClient *c) {
3871 popGenericCommand(c,REDIS_HEAD);
3872}
3873
3874static void rpopCommand(redisClient *c) {
3875 popGenericCommand(c,REDIS_TAIL);
3876}
3877
3878static void lrangeCommand(redisClient *c) {
3305306f 3879 robj *o;
ed9b544e 3880 int start = atoi(c->argv[2]->ptr);
3881 int end = atoi(c->argv[3]->ptr);
3305306f 3882
3883 o = lookupKeyRead(c->db,c->argv[1]);
3884 if (o == NULL) {
c937aa89 3885 addReply(c,shared.nullmultibulk);
ed9b544e 3886 } else {
ed9b544e 3887 if (o->type != REDIS_LIST) {
c937aa89 3888 addReply(c,shared.wrongtypeerr);
ed9b544e 3889 } else {
3890 list *list = o->ptr;
3891 listNode *ln;
3892 int llen = listLength(list);
3893 int rangelen, j;
3894 robj *ele;
3895
3896 /* convert negative indexes */
3897 if (start < 0) start = llen+start;
3898 if (end < 0) end = llen+end;
3899 if (start < 0) start = 0;
3900 if (end < 0) end = 0;
3901
3902 /* indexes sanity checks */
3903 if (start > end || start >= llen) {
3904 /* Out of range start or start > end result in empty list */
c937aa89 3905 addReply(c,shared.emptymultibulk);
ed9b544e 3906 return;
3907 }
3908 if (end >= llen) end = llen-1;
3909 rangelen = (end-start)+1;
3910
3911 /* Return the result in form of a multi-bulk reply */
3912 ln = listIndex(list, start);
c937aa89 3913 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 3914 for (j = 0; j < rangelen; j++) {
3915 ele = listNodeValue(ln);
942a3961 3916 addReplyBulkLen(c,ele);
ed9b544e 3917 addReply(c,ele);
3918 addReply(c,shared.crlf);
3919 ln = ln->next;
3920 }
3921 }
3922 }
3923}
3924
3925static void ltrimCommand(redisClient *c) {
3305306f 3926 robj *o;
ed9b544e 3927 int start = atoi(c->argv[2]->ptr);
3928 int end = atoi(c->argv[3]->ptr);
3929
3305306f 3930 o = lookupKeyWrite(c->db,c->argv[1]);
3931 if (o == NULL) {
ab9d4cb1 3932 addReply(c,shared.ok);
ed9b544e 3933 } else {
ed9b544e 3934 if (o->type != REDIS_LIST) {
3935 addReply(c,shared.wrongtypeerr);
3936 } else {
3937 list *list = o->ptr;
3938 listNode *ln;
3939 int llen = listLength(list);
3940 int j, ltrim, rtrim;
3941
3942 /* convert negative indexes */
3943 if (start < 0) start = llen+start;
3944 if (end < 0) end = llen+end;
3945 if (start < 0) start = 0;
3946 if (end < 0) end = 0;
3947
3948 /* indexes sanity checks */
3949 if (start > end || start >= llen) {
3950 /* Out of range start or start > end result in empty list */
3951 ltrim = llen;
3952 rtrim = 0;
3953 } else {
3954 if (end >= llen) end = llen-1;
3955 ltrim = start;
3956 rtrim = llen-end-1;
3957 }
3958
3959 /* Remove list elements to perform the trim */
3960 for (j = 0; j < ltrim; j++) {
3961 ln = listFirst(list);
3962 listDelNode(list,ln);
3963 }
3964 for (j = 0; j < rtrim; j++) {
3965 ln = listLast(list);
3966 listDelNode(list,ln);
3967 }
ed9b544e 3968 server.dirty++;
e59229a2 3969 addReply(c,shared.ok);
ed9b544e 3970 }
3971 }
3972}
3973
3974static void lremCommand(redisClient *c) {
3305306f 3975 robj *o;
ed9b544e 3976
3305306f 3977 o = lookupKeyWrite(c->db,c->argv[1]);
3978 if (o == NULL) {
33c08b39 3979 addReply(c,shared.czero);
ed9b544e 3980 } else {
ed9b544e 3981 if (o->type != REDIS_LIST) {
c937aa89 3982 addReply(c,shared.wrongtypeerr);
ed9b544e 3983 } else {
3984 list *list = o->ptr;
3985 listNode *ln, *next;
3986 int toremove = atoi(c->argv[2]->ptr);
3987 int removed = 0;
3988 int fromtail = 0;
3989
3990 if (toremove < 0) {
3991 toremove = -toremove;
3992 fromtail = 1;
3993 }
3994 ln = fromtail ? list->tail : list->head;
3995 while (ln) {
ed9b544e 3996 robj *ele = listNodeValue(ln);
a4d1ba9a 3997
3998 next = fromtail ? ln->prev : ln->next;
724a51b1 3999 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4000 listDelNode(list,ln);
4001 server.dirty++;
4002 removed++;
4003 if (toremove && removed == toremove) break;
4004 }
4005 ln = next;
4006 }
c937aa89 4007 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4008 }
4009 }
4010}
4011
12f9d551 4012/* This is the semantic of this command:
0f5f7e9a 4013 * RPOPLPUSH srclist dstlist:
12f9d551 4014 * IF LLEN(srclist) > 0
4015 * element = RPOP srclist
4016 * LPUSH dstlist element
4017 * RETURN element
4018 * ELSE
4019 * RETURN nil
4020 * END
4021 * END
4022 *
4023 * The idea is to be able to get an element from a list in a reliable way
4024 * since the element is not just returned but pushed against another list
4025 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4026 */
0f5f7e9a 4027static void rpoplpushcommand(redisClient *c) {
12f9d551 4028 robj *sobj;
4029
4030 sobj = lookupKeyWrite(c->db,c->argv[1]);
4031 if (sobj == NULL) {
4032 addReply(c,shared.nullbulk);
4033 } else {
4034 if (sobj->type != REDIS_LIST) {
4035 addReply(c,shared.wrongtypeerr);
4036 } else {
4037 list *srclist = sobj->ptr;
4038 listNode *ln = listLast(srclist);
4039
4040 if (ln == NULL) {
4041 addReply(c,shared.nullbulk);
4042 } else {
4043 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4044 robj *ele = listNodeValue(ln);
4045 list *dstlist;
4046
e20fb74f 4047 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4048 addReply(c,shared.wrongtypeerr);
4049 return;
4050 }
e20fb74f 4051
4052 /* Add the element to the target list (unless it's directly
4053 * passed to some BLPOP-ing client */
4054 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4055 if (dobj == NULL) {
4056 /* Create the list if the key does not exist */
4057 dobj = createListObject();
4058 dictAdd(c->db->dict,c->argv[2],dobj);
4059 incrRefCount(c->argv[2]);
4060 }
4061 dstlist = dobj->ptr;
4062 listAddNodeHead(dstlist,ele);
4063 incrRefCount(ele);
4064 }
12f9d551 4065
4066 /* Send the element to the client as reply as well */
4067 addReplyBulkLen(c,ele);
4068 addReply(c,ele);
4069 addReply(c,shared.crlf);
4070
4071 /* Finally remove the element from the source list */
4072 listDelNode(srclist,ln);
4073 server.dirty++;
4074 }
4075 }
4076 }
4077}
4078
4079
ed9b544e 4080/* ==================================== Sets ================================ */
4081
4082static void saddCommand(redisClient *c) {
ed9b544e 4083 robj *set;
4084
3305306f 4085 set = lookupKeyWrite(c->db,c->argv[1]);
4086 if (set == NULL) {
ed9b544e 4087 set = createSetObject();
3305306f 4088 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4089 incrRefCount(c->argv[1]);
4090 } else {
ed9b544e 4091 if (set->type != REDIS_SET) {
c937aa89 4092 addReply(c,shared.wrongtypeerr);
ed9b544e 4093 return;
4094 }
4095 }
4096 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4097 incrRefCount(c->argv[2]);
4098 server.dirty++;
c937aa89 4099 addReply(c,shared.cone);
ed9b544e 4100 } else {
c937aa89 4101 addReply(c,shared.czero);
ed9b544e 4102 }
4103}
4104
4105static void sremCommand(redisClient *c) {
3305306f 4106 robj *set;
ed9b544e 4107
3305306f 4108 set = lookupKeyWrite(c->db,c->argv[1]);
4109 if (set == NULL) {
c937aa89 4110 addReply(c,shared.czero);
ed9b544e 4111 } else {
ed9b544e 4112 if (set->type != REDIS_SET) {
c937aa89 4113 addReply(c,shared.wrongtypeerr);
ed9b544e 4114 return;
4115 }
4116 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4117 server.dirty++;
12fea928 4118 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4119 addReply(c,shared.cone);
ed9b544e 4120 } else {
c937aa89 4121 addReply(c,shared.czero);
ed9b544e 4122 }
4123 }
4124}
4125
a4460ef4 4126static void smoveCommand(redisClient *c) {
4127 robj *srcset, *dstset;
4128
4129 srcset = lookupKeyWrite(c->db,c->argv[1]);
4130 dstset = lookupKeyWrite(c->db,c->argv[2]);
4131
4132 /* If the source key does not exist return 0, if it's of the wrong type
4133 * raise an error */
4134 if (srcset == NULL || srcset->type != REDIS_SET) {
4135 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4136 return;
4137 }
4138 /* Error if the destination key is not a set as well */
4139 if (dstset && dstset->type != REDIS_SET) {
4140 addReply(c,shared.wrongtypeerr);
4141 return;
4142 }
4143 /* Remove the element from the source set */
4144 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4145 /* Key not found in the src set! return zero */
4146 addReply(c,shared.czero);
4147 return;
4148 }
4149 server.dirty++;
4150 /* Add the element to the destination set */
4151 if (!dstset) {
4152 dstset = createSetObject();
4153 dictAdd(c->db->dict,c->argv[2],dstset);
4154 incrRefCount(c->argv[2]);
4155 }
4156 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4157 incrRefCount(c->argv[3]);
4158 addReply(c,shared.cone);
4159}
4160
ed9b544e 4161static void sismemberCommand(redisClient *c) {
3305306f 4162 robj *set;
ed9b544e 4163
3305306f 4164 set = lookupKeyRead(c->db,c->argv[1]);
4165 if (set == NULL) {
c937aa89 4166 addReply(c,shared.czero);
ed9b544e 4167 } else {
ed9b544e 4168 if (set->type != REDIS_SET) {
c937aa89 4169 addReply(c,shared.wrongtypeerr);
ed9b544e 4170 return;
4171 }
4172 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4173 addReply(c,shared.cone);
ed9b544e 4174 else
c937aa89 4175 addReply(c,shared.czero);
ed9b544e 4176 }
4177}
4178
4179static void scardCommand(redisClient *c) {
3305306f 4180 robj *o;
ed9b544e 4181 dict *s;
4182
3305306f 4183 o = lookupKeyRead(c->db,c->argv[1]);
4184 if (o == NULL) {
c937aa89 4185 addReply(c,shared.czero);
ed9b544e 4186 return;
4187 } else {
ed9b544e 4188 if (o->type != REDIS_SET) {
c937aa89 4189 addReply(c,shared.wrongtypeerr);
ed9b544e 4190 } else {
4191 s = o->ptr;
682ac724 4192 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4193 dictSize(s)));
ed9b544e 4194 }
4195 }
4196}
4197
12fea928 4198static void spopCommand(redisClient *c) {
4199 robj *set;
4200 dictEntry *de;
4201
4202 set = lookupKeyWrite(c->db,c->argv[1]);
4203 if (set == NULL) {
4204 addReply(c,shared.nullbulk);
4205 } else {
4206 if (set->type != REDIS_SET) {
4207 addReply(c,shared.wrongtypeerr);
4208 return;
4209 }
4210 de = dictGetRandomKey(set->ptr);
4211 if (de == NULL) {
4212 addReply(c,shared.nullbulk);
4213 } else {
4214 robj *ele = dictGetEntryKey(de);
4215
942a3961 4216 addReplyBulkLen(c,ele);
12fea928 4217 addReply(c,ele);
4218 addReply(c,shared.crlf);
4219 dictDelete(set->ptr,ele);
4220 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4221 server.dirty++;
4222 }
4223 }
4224}
4225
2abb95a9 4226static void srandmemberCommand(redisClient *c) {
4227 robj *set;
4228 dictEntry *de;
4229
4230 set = lookupKeyRead(c->db,c->argv[1]);
4231 if (set == NULL) {
4232 addReply(c,shared.nullbulk);
4233 } else {
4234 if (set->type != REDIS_SET) {
4235 addReply(c,shared.wrongtypeerr);
4236 return;
4237 }
4238 de = dictGetRandomKey(set->ptr);
4239 if (de == NULL) {
4240 addReply(c,shared.nullbulk);
4241 } else {
4242 robj *ele = dictGetEntryKey(de);
4243
4244 addReplyBulkLen(c,ele);
4245 addReply(c,ele);
4246 addReply(c,shared.crlf);
4247 }
4248 }
4249}
4250
ed9b544e 4251static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4252 dict **d1 = (void*) s1, **d2 = (void*) s2;
4253
3305306f 4254 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4255}
4256
682ac724 4257static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4258 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4259 dictIterator *di;
4260 dictEntry *de;
4261 robj *lenobj = NULL, *dstset = NULL;
682ac724 4262 unsigned long j, cardinality = 0;
ed9b544e 4263
ed9b544e 4264 for (j = 0; j < setsnum; j++) {
4265 robj *setobj;
3305306f 4266
4267 setobj = dstkey ?
4268 lookupKeyWrite(c->db,setskeys[j]) :
4269 lookupKeyRead(c->db,setskeys[j]);
4270 if (!setobj) {
ed9b544e 4271 zfree(dv);
5faa6025 4272 if (dstkey) {
fdcaae84 4273 if (deleteKey(c->db,dstkey))
4274 server.dirty++;
0d36ded0 4275 addReply(c,shared.czero);
5faa6025 4276 } else {
4277 addReply(c,shared.nullmultibulk);
4278 }
ed9b544e 4279 return;
4280 }
ed9b544e 4281 if (setobj->type != REDIS_SET) {
4282 zfree(dv);
c937aa89 4283 addReply(c,shared.wrongtypeerr);
ed9b544e 4284 return;
4285 }
4286 dv[j] = setobj->ptr;
4287 }
4288 /* Sort sets from the smallest to largest, this will improve our
4289 * algorithm's performace */
4290 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4291
4292 /* The first thing we should output is the total number of elements...
4293 * since this is a multi-bulk write, but at this stage we don't know
4294 * the intersection set size, so we use a trick, append an empty object
4295 * to the output list and save the pointer to later modify it with the
4296 * right length */
4297 if (!dstkey) {
4298 lenobj = createObject(REDIS_STRING,NULL);
4299 addReply(c,lenobj);
4300 decrRefCount(lenobj);
4301 } else {
4302 /* If we have a target key where to store the resulting set
4303 * create this key with an empty set inside */
4304 dstset = createSetObject();
ed9b544e 4305 }
4306
4307 /* Iterate all the elements of the first (smallest) set, and test
4308 * the element against all the other sets, if at least one set does
4309 * not include the element it is discarded */
4310 di = dictGetIterator(dv[0]);
ed9b544e 4311
4312 while((de = dictNext(di)) != NULL) {
4313 robj *ele;
4314
4315 for (j = 1; j < setsnum; j++)
4316 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4317 if (j != setsnum)
4318 continue; /* at least one set does not contain the member */
4319 ele = dictGetEntryKey(de);
4320 if (!dstkey) {
942a3961 4321 addReplyBulkLen(c,ele);
ed9b544e 4322 addReply(c,ele);
4323 addReply(c,shared.crlf);
4324 cardinality++;
4325 } else {
4326 dictAdd(dstset->ptr,ele,NULL);
4327 incrRefCount(ele);
4328 }
4329 }
4330 dictReleaseIterator(di);
4331
83cdfe18
AG
4332 if (dstkey) {
4333 /* Store the resulting set into the target */
4334 deleteKey(c->db,dstkey);
4335 dictAdd(c->db->dict,dstkey,dstset);
4336 incrRefCount(dstkey);
4337 }
4338
40d224a9 4339 if (!dstkey) {
682ac724 4340 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4341 } else {
682ac724 4342 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4343 dictSize((dict*)dstset->ptr)));
40d224a9 4344 server.dirty++;
4345 }
ed9b544e 4346 zfree(dv);
4347}
4348
4349static void sinterCommand(redisClient *c) {
4350 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4351}
4352
4353static void sinterstoreCommand(redisClient *c) {
4354 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4355}
4356
f4f56e1d 4357#define REDIS_OP_UNION 0
4358#define REDIS_OP_DIFF 1
4359
4360static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4361 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4362 dictIterator *di;
4363 dictEntry *de;
f4f56e1d 4364 robj *dstset = NULL;
40d224a9 4365 int j, cardinality = 0;
4366
40d224a9 4367 for (j = 0; j < setsnum; j++) {
4368 robj *setobj;
4369
4370 setobj = dstkey ?
4371 lookupKeyWrite(c->db,setskeys[j]) :
4372 lookupKeyRead(c->db,setskeys[j]);
4373 if (!setobj) {
4374 dv[j] = NULL;
4375 continue;
4376 }
4377 if (setobj->type != REDIS_SET) {
4378 zfree(dv);
4379 addReply(c,shared.wrongtypeerr);
4380 return;
4381 }
4382 dv[j] = setobj->ptr;
4383 }
4384
4385 /* We need a temp set object to store our union. If the dstkey
4386 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4387 * this set object will be the resulting object to set into the target key*/
4388 dstset = createSetObject();
4389
40d224a9 4390 /* Iterate all the elements of all the sets, add every element a single
4391 * time to the result set */
4392 for (j = 0; j < setsnum; j++) {
51829ed3 4393 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4394 if (!dv[j]) continue; /* non existing keys are like empty sets */
4395
4396 di = dictGetIterator(dv[j]);
40d224a9 4397
4398 while((de = dictNext(di)) != NULL) {
4399 robj *ele;
4400
4401 /* dictAdd will not add the same element multiple times */
4402 ele = dictGetEntryKey(de);
f4f56e1d 4403 if (op == REDIS_OP_UNION || j == 0) {
4404 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4405 incrRefCount(ele);
40d224a9 4406 cardinality++;
4407 }
f4f56e1d 4408 } else if (op == REDIS_OP_DIFF) {
4409 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4410 cardinality--;
4411 }
40d224a9 4412 }
4413 }
4414 dictReleaseIterator(di);
51829ed3
AG
4415
4416 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4417 }
4418
f4f56e1d 4419 /* Output the content of the resulting set, if not in STORE mode */
4420 if (!dstkey) {
4421 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4422 di = dictGetIterator(dstset->ptr);
f4f56e1d 4423 while((de = dictNext(di)) != NULL) {
4424 robj *ele;
4425
4426 ele = dictGetEntryKey(de);
942a3961 4427 addReplyBulkLen(c,ele);
f4f56e1d 4428 addReply(c,ele);
4429 addReply(c,shared.crlf);
4430 }
4431 dictReleaseIterator(di);
83cdfe18
AG
4432 } else {
4433 /* If we have a target key where to store the resulting set
4434 * create this key with the result set inside */
4435 deleteKey(c->db,dstkey);
4436 dictAdd(c->db->dict,dstkey,dstset);
4437 incrRefCount(dstkey);
f4f56e1d 4438 }
4439
4440 /* Cleanup */
40d224a9 4441 if (!dstkey) {
40d224a9 4442 decrRefCount(dstset);
4443 } else {
682ac724 4444 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4445 dictSize((dict*)dstset->ptr)));
40d224a9 4446 server.dirty++;
4447 }
4448 zfree(dv);
4449}
4450
4451static void sunionCommand(redisClient *c) {
f4f56e1d 4452 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4453}
4454
4455static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4456 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4457}
4458
4459static void sdiffCommand(redisClient *c) {
4460 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4461}
4462
4463static void sdiffstoreCommand(redisClient *c) {
4464 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4465}
4466
6b47e12e 4467/* ==================================== ZSets =============================== */
4468
4469/* ZSETs are ordered sets using two data structures to hold the same elements
4470 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4471 * data structure.
4472 *
4473 * The elements are added to a hash table mapping Redis objects to scores.
4474 * At the same time the elements are added to a skip list mapping scores
4475 * to Redis objects (so objects are sorted by scores in this "view"). */
4476
4477/* This skiplist implementation is almost a C translation of the original
4478 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4479 * Alternative to Balanced Trees", modified in three ways:
4480 * a) this implementation allows for repeated values.
4481 * b) the comparison is not just by key (our 'score') but by satellite data.
4482 * c) there is a back pointer, so it's a doubly linked list with the back
4483 * pointers being only at "level 1". This allows to traverse the list
4484 * from tail to head, useful for ZREVRANGE. */
4485
4486static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4487 zskiplistNode *zn = zmalloc(sizeof(*zn));
4488
4489 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4490 zn->score = score;
4491 zn->obj = obj;
4492 return zn;
4493}
4494
4495static zskiplist *zslCreate(void) {
4496 int j;
4497 zskiplist *zsl;
4498
4499 zsl = zmalloc(sizeof(*zsl));
4500 zsl->level = 1;
cc812361 4501 zsl->length = 0;
6b47e12e 4502 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4503 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++)
4504 zsl->header->forward[j] = NULL;
e3870fab 4505 zsl->header->backward = NULL;
4506 zsl->tail = NULL;
6b47e12e 4507 return zsl;
4508}
4509
fd8ccf44 4510static void zslFreeNode(zskiplistNode *node) {
4511 decrRefCount(node->obj);
ad807e6f 4512 zfree(node->forward);
fd8ccf44 4513 zfree(node);
4514}
4515
4516static void zslFree(zskiplist *zsl) {
ad807e6f 4517 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 4518
ad807e6f 4519 zfree(zsl->header->forward);
4520 zfree(zsl->header);
fd8ccf44 4521 while(node) {
599379dd 4522 next = node->forward[0];
fd8ccf44 4523 zslFreeNode(node);
4524 node = next;
4525 }
ad807e6f 4526 zfree(zsl);
fd8ccf44 4527}
4528
6b47e12e 4529static int zslRandomLevel(void) {
4530 int level = 1;
4531 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4532 level += 1;
4533 return level;
4534}
4535
4536static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4537 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4538 int i, level;
4539
4540 x = zsl->header;
4541 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4542 while (x->forward[i] &&
4543 (x->forward[i]->score < score ||
4544 (x->forward[i]->score == score &&
4545 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6b47e12e 4546 x = x->forward[i];
4547 update[i] = x;
4548 }
6b47e12e 4549 /* we assume the key is not already inside, since we allow duplicated
4550 * scores, and the re-insertion of score and redis object should never
4551 * happpen since the caller of zslInsert() should test in the hash table
4552 * if the element is already inside or not. */
4553 level = zslRandomLevel();
4554 if (level > zsl->level) {
4555 for (i = zsl->level; i < level; i++)
4556 update[i] = zsl->header;
4557 zsl->level = level;
4558 }
4559 x = zslCreateNode(level,score,obj);
4560 for (i = 0; i < level; i++) {
4561 x->forward[i] = update[i]->forward[i];
4562 update[i]->forward[i] = x;
4563 }
bb975144 4564 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 4565 if (x->forward[0])
4566 x->forward[0]->backward = x;
4567 else
4568 zsl->tail = x;
cc812361 4569 zsl->length++;
6b47e12e 4570}
4571
50c55df5 4572/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 4573static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 4574 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4575 int i;
4576
4577 x = zsl->header;
4578 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4579 while (x->forward[i] &&
4580 (x->forward[i]->score < score ||
4581 (x->forward[i]->score == score &&
4582 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 4583 x = x->forward[i];
4584 update[i] = x;
4585 }
4586 /* We may have multiple elements with the same score, what we need
4587 * is to find the element with both the right score and object. */
4588 x = x->forward[0];
50c55df5 4589 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
9d60e6e4 4590 for (i = 0; i < zsl->level; i++) {
4591 if (update[i]->forward[i] != x) break;
4592 update[i]->forward[i] = x->forward[i];
4593 }
4594 if (x->forward[0]) {
4595 x->forward[0]->backward = (x->backward == zsl->header) ?
4596 NULL : x->backward;
e197b441 4597 } else {
9d60e6e4 4598 zsl->tail = x->backward;
e197b441 4599 }
9d60e6e4 4600 zslFreeNode(x);
4601 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4602 zsl->level--;
4603 zsl->length--;
4604 return 1;
4605 } else {
4606 return 0; /* not found */
e197b441 4607 }
4608 return 0; /* not found */
fd8ccf44 4609}
4610
1807985b 4611/* Delete all the elements with score between min and max from the skiplist.
4612 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4613 * Note that this function takes the reference to the hash table view of the
4614 * sorted set, in order to remove the elements from the hash table too. */
4615static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4616 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4617 unsigned long removed = 0;
4618 int i;
4619
4620 x = zsl->header;
4621 for (i = zsl->level-1; i >= 0; i--) {
4622 while (x->forward[i] && x->forward[i]->score < min)
4623 x = x->forward[i];
4624 update[i] = x;
4625 }
4626 /* We may have multiple elements with the same score, what we need
4627 * is to find the element with both the right score and object. */
4628 x = x->forward[0];
4629 while (x && x->score <= max) {
4630 zskiplistNode *next;
4631
4632 for (i = 0; i < zsl->level; i++) {
4633 if (update[i]->forward[i] != x) break;
4634 update[i]->forward[i] = x->forward[i];
4635 }
4636 if (x->forward[0]) {
4637 x->forward[0]->backward = (x->backward == zsl->header) ?
4638 NULL : x->backward;
4639 } else {
4640 zsl->tail = x->backward;
4641 }
4642 next = x->forward[0];
4643 dictDelete(dict,x->obj);
4644 zslFreeNode(x);
4645 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4646 zsl->level--;
4647 zsl->length--;
4648 removed++;
4649 x = next;
4650 }
4651 return removed; /* not found */
4652}
4653
50c55df5 4654/* Find the first node having a score equal or greater than the specified one.
4655 * Returns NULL if there is no match. */
4656static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
4657 zskiplistNode *x;
4658 int i;
4659
4660 x = zsl->header;
4661 for (i = zsl->level-1; i >= 0; i--) {
4662 while (x->forward[i] && x->forward[i]->score < score)
4663 x = x->forward[i];
4664 }
4665 /* We may have multiple elements with the same score, what we need
4666 * is to find the element with both the right score and object. */
4667 return x->forward[0];
4668}
4669
fd8ccf44 4670/* The actual Z-commands implementations */
4671
7db723ad 4672/* This generic command implements both ZADD and ZINCRBY.
e2665397 4673 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 4674 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 4675static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 4676 robj *zsetobj;
4677 zset *zs;
4678 double *score;
4679
e2665397 4680 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 4681 if (zsetobj == NULL) {
4682 zsetobj = createZsetObject();
e2665397 4683 dictAdd(c->db->dict,key,zsetobj);
4684 incrRefCount(key);
fd8ccf44 4685 } else {
4686 if (zsetobj->type != REDIS_ZSET) {
4687 addReply(c,shared.wrongtypeerr);
4688 return;
4689 }
4690 }
fd8ccf44 4691 zs = zsetobj->ptr;
e2665397 4692
7db723ad 4693 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 4694 * needs to handle the two different conditions. It's all about setting
4695 * '*score', that is, the new score to set, to the right value. */
4696 score = zmalloc(sizeof(double));
4697 if (doincrement) {
4698 dictEntry *de;
4699
4700 /* Read the old score. If the element was not present starts from 0 */
4701 de = dictFind(zs->dict,ele);
4702 if (de) {
4703 double *oldscore = dictGetEntryVal(de);
4704 *score = *oldscore + scoreval;
4705 } else {
4706 *score = scoreval;
4707 }
4708 } else {
4709 *score = scoreval;
4710 }
4711
4712 /* What follows is a simple remove and re-insert operation that is common
7db723ad 4713 * to both ZADD and ZINCRBY... */
e2665397 4714 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 4715 /* case 1: New element */
e2665397 4716 incrRefCount(ele); /* added to hash */
4717 zslInsert(zs->zsl,*score,ele);
4718 incrRefCount(ele); /* added to skiplist */
fd8ccf44 4719 server.dirty++;
e2665397 4720 if (doincrement)
e2665397 4721 addReplyDouble(c,*score);
91d71bfc 4722 else
4723 addReply(c,shared.cone);
fd8ccf44 4724 } else {
4725 dictEntry *de;
4726 double *oldscore;
4727
4728 /* case 2: Score update operation */
e2665397 4729 de = dictFind(zs->dict,ele);
dfc5e96c 4730 redisAssert(de != NULL);
fd8ccf44 4731 oldscore = dictGetEntryVal(de);
4732 if (*score != *oldscore) {
4733 int deleted;
4734
e2665397 4735 /* Remove and insert the element in the skip list with new score */
4736 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 4737 redisAssert(deleted != 0);
e2665397 4738 zslInsert(zs->zsl,*score,ele);
4739 incrRefCount(ele);
4740 /* Update the score in the hash table */
4741 dictReplace(zs->dict,ele,score);
fd8ccf44 4742 server.dirty++;
2161a965 4743 } else {
4744 zfree(score);
fd8ccf44 4745 }
e2665397 4746 if (doincrement)
4747 addReplyDouble(c,*score);
4748 else
4749 addReply(c,shared.czero);
fd8ccf44 4750 }
4751}
4752
e2665397 4753static void zaddCommand(redisClient *c) {
4754 double scoreval;
4755
4756 scoreval = strtod(c->argv[2]->ptr,NULL);
4757 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
4758}
4759
7db723ad 4760static void zincrbyCommand(redisClient *c) {
e2665397 4761 double scoreval;
4762
4763 scoreval = strtod(c->argv[2]->ptr,NULL);
4764 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
4765}
4766
1b7106e7 4767static void zremCommand(redisClient *c) {
4768 robj *zsetobj;
4769 zset *zs;
4770
4771 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4772 if (zsetobj == NULL) {
4773 addReply(c,shared.czero);
4774 } else {
4775 dictEntry *de;
4776 double *oldscore;
4777 int deleted;
4778
4779 if (zsetobj->type != REDIS_ZSET) {
4780 addReply(c,shared.wrongtypeerr);
4781 return;
4782 }
4783 zs = zsetobj->ptr;
4784 de = dictFind(zs->dict,c->argv[2]);
4785 if (de == NULL) {
4786 addReply(c,shared.czero);
4787 return;
4788 }
4789 /* Delete from the skiplist */
4790 oldscore = dictGetEntryVal(de);
4791 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 4792 redisAssert(deleted != 0);
1b7106e7 4793
4794 /* Delete from the hash table */
4795 dictDelete(zs->dict,c->argv[2]);
4796 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4797 server.dirty++;
4798 addReply(c,shared.cone);
4799 }
4800}
4801
1807985b 4802static void zremrangebyscoreCommand(redisClient *c) {
4803 double min = strtod(c->argv[2]->ptr,NULL);
4804 double max = strtod(c->argv[3]->ptr,NULL);
4805 robj *zsetobj;
4806 zset *zs;
4807
4808 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4809 if (zsetobj == NULL) {
4810 addReply(c,shared.czero);
4811 } else {
4812 long deleted;
4813
4814 if (zsetobj->type != REDIS_ZSET) {
4815 addReply(c,shared.wrongtypeerr);
4816 return;
4817 }
4818 zs = zsetobj->ptr;
4819 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
4820 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4821 server.dirty += deleted;
4822 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
4823 }
4824}
4825
e3870fab 4826static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 4827 robj *o;
4828 int start = atoi(c->argv[2]->ptr);
4829 int end = atoi(c->argv[3]->ptr);
752da584 4830 int withscores = 0;
4831
4832 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
4833 withscores = 1;
4834 } else if (c->argc >= 5) {
4835 addReply(c,shared.syntaxerr);
4836 return;
4837 }
cc812361 4838
4839 o = lookupKeyRead(c->db,c->argv[1]);
4840 if (o == NULL) {
4841 addReply(c,shared.nullmultibulk);
4842 } else {
4843 if (o->type != REDIS_ZSET) {
4844 addReply(c,shared.wrongtypeerr);
4845 } else {
4846 zset *zsetobj = o->ptr;
4847 zskiplist *zsl = zsetobj->zsl;
4848 zskiplistNode *ln;
4849
4850 int llen = zsl->length;
4851 int rangelen, j;
4852 robj *ele;
4853
4854 /* convert negative indexes */
4855 if (start < 0) start = llen+start;
4856 if (end < 0) end = llen+end;
4857 if (start < 0) start = 0;
4858 if (end < 0) end = 0;
4859
4860 /* indexes sanity checks */
4861 if (start > end || start >= llen) {
4862 /* Out of range start or start > end result in empty list */
4863 addReply(c,shared.emptymultibulk);
4864 return;
4865 }
4866 if (end >= llen) end = llen-1;
4867 rangelen = (end-start)+1;
4868
4869 /* Return the result in form of a multi-bulk reply */
e3870fab 4870 if (reverse) {
4871 ln = zsl->tail;
4872 while (start--)
4873 ln = ln->backward;
4874 } else {
4875 ln = zsl->header->forward[0];
4876 while (start--)
4877 ln = ln->forward[0];
4878 }
cc812361 4879
752da584 4880 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
4881 withscores ? (rangelen*2) : rangelen));
cc812361 4882 for (j = 0; j < rangelen; j++) {
0aad7a19 4883 ele = ln->obj;
cc812361 4884 addReplyBulkLen(c,ele);
4885 addReply(c,ele);
4886 addReply(c,shared.crlf);
752da584 4887 if (withscores)
4888 addReplyDouble(c,ln->score);
e3870fab 4889 ln = reverse ? ln->backward : ln->forward[0];
cc812361 4890 }
4891 }
4892 }
4893}
4894
e3870fab 4895static void zrangeCommand(redisClient *c) {
4896 zrangeGenericCommand(c,0);
4897}
4898
4899static void zrevrangeCommand(redisClient *c) {
4900 zrangeGenericCommand(c,1);
4901}
4902
50c55df5 4903static void zrangebyscoreCommand(redisClient *c) {
4904 robj *o;
4905 double min = strtod(c->argv[2]->ptr,NULL);
4906 double max = strtod(c->argv[3]->ptr,NULL);
80181f78 4907 int offset = 0, limit = -1;
4908
4909 if (c->argc != 4 && c->argc != 7) {
454d4e43 4910 addReplySds(c,
4911 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 4912 return;
4913 } else if (c->argc == 7 && strcasecmp(c->argv[4]->ptr,"limit")) {
4914 addReply(c,shared.syntaxerr);
4915 return;
4916 } else if (c->argc == 7) {
4917 offset = atoi(c->argv[5]->ptr);
4918 limit = atoi(c->argv[6]->ptr);
0b13687c 4919 if (offset < 0) offset = 0;
80181f78 4920 }
50c55df5 4921
4922 o = lookupKeyRead(c->db,c->argv[1]);
4923 if (o == NULL) {
4924 addReply(c,shared.nullmultibulk);
4925 } else {
4926 if (o->type != REDIS_ZSET) {
4927 addReply(c,shared.wrongtypeerr);
4928 } else {
4929 zset *zsetobj = o->ptr;
4930 zskiplist *zsl = zsetobj->zsl;
4931 zskiplistNode *ln;
4932 robj *ele, *lenobj;
4933 unsigned int rangelen = 0;
4934
4935 /* Get the first node with the score >= min */
4936 ln = zslFirstWithScore(zsl,min);
4937 if (ln == NULL) {
4938 /* No element matching the speciifed interval */
4939 addReply(c,shared.emptymultibulk);
4940 return;
4941 }
4942
4943 /* We don't know in advance how many matching elements there
4944 * are in the list, so we push this object that will represent
4945 * the multi-bulk length in the output buffer, and will "fix"
4946 * it later */
4947 lenobj = createObject(REDIS_STRING,NULL);
4948 addReply(c,lenobj);
c74e7c77 4949 decrRefCount(lenobj);
50c55df5 4950
dbbc7285 4951 while(ln && ln->score <= max) {
80181f78 4952 if (offset) {
4953 offset--;
4954 ln = ln->forward[0];
4955 continue;
4956 }
4957 if (limit == 0) break;
50c55df5 4958 ele = ln->obj;
4959 addReplyBulkLen(c,ele);
4960 addReply(c,ele);
4961 addReply(c,shared.crlf);
4962 ln = ln->forward[0];
4963 rangelen++;
80181f78 4964 if (limit > 0) limit--;
50c55df5 4965 }
4966 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",rangelen);
4967 }
4968 }
4969}
4970
3c41331e 4971static void zcardCommand(redisClient *c) {
e197b441 4972 robj *o;
4973 zset *zs;
4974
4975 o = lookupKeyRead(c->db,c->argv[1]);
4976 if (o == NULL) {
4977 addReply(c,shared.czero);
4978 return;
4979 } else {
4980 if (o->type != REDIS_ZSET) {
4981 addReply(c,shared.wrongtypeerr);
4982 } else {
4983 zs = o->ptr;
682ac724 4984 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 4985 }
4986 }
4987}
4988
6e333bbe 4989static void zscoreCommand(redisClient *c) {
4990 robj *o;
4991 zset *zs;
4992
4993 o = lookupKeyRead(c->db,c->argv[1]);
4994 if (o == NULL) {
96d8b4ee 4995 addReply(c,shared.nullbulk);
6e333bbe 4996 return;
4997 } else {
4998 if (o->type != REDIS_ZSET) {
4999 addReply(c,shared.wrongtypeerr);
5000 } else {
5001 dictEntry *de;
5002
5003 zs = o->ptr;
5004 de = dictFind(zs->dict,c->argv[2]);
5005 if (!de) {
5006 addReply(c,shared.nullbulk);
5007 } else {
6e333bbe 5008 double *score = dictGetEntryVal(de);
5009
e2665397 5010 addReplyDouble(c,*score);
6e333bbe 5011 }
5012 }
5013 }
5014}
5015
6b47e12e 5016/* ========================= Non type-specific commands ==================== */
5017
ed9b544e 5018static void flushdbCommand(redisClient *c) {
ca37e9cd 5019 server.dirty += dictSize(c->db->dict);
3305306f 5020 dictEmpty(c->db->dict);
5021 dictEmpty(c->db->expires);
ed9b544e 5022 addReply(c,shared.ok);
ed9b544e 5023}
5024
5025static void flushallCommand(redisClient *c) {
ca37e9cd 5026 server.dirty += emptyDb();
ed9b544e 5027 addReply(c,shared.ok);
f78fd11b 5028 rdbSave(server.dbfilename);
ca37e9cd 5029 server.dirty++;
ed9b544e 5030}
5031
56906eef 5032static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 5033 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 5034 so->type = type;
5035 so->pattern = pattern;
5036 return so;
5037}
5038
5039/* Return the value associated to the key with a name obtained
5040 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 5041static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 5042 char *p;
5043 sds spat, ssub;
5044 robj keyobj;
5045 int prefixlen, sublen, postfixlen;
ed9b544e 5046 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5047 struct {
f1017b3f 5048 long len;
5049 long free;
ed9b544e 5050 char buf[REDIS_SORTKEY_MAX+1];
5051 } keyname;
5052
28173a49 5053 /* If the pattern is "#" return the substitution object itself in order
5054 * to implement the "SORT ... GET #" feature. */
5055 spat = pattern->ptr;
5056 if (spat[0] == '#' && spat[1] == '\0') {
5057 return subst;
5058 }
5059
5060 /* The substitution object may be specially encoded. If so we create
9d65a1bb 5061 * a decoded object on the fly. Otherwise getDecodedObject will just
5062 * increment the ref count, that we'll decrement later. */
5063 subst = getDecodedObject(subst);
942a3961 5064
ed9b544e 5065 ssub = subst->ptr;
5066 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5067 p = strchr(spat,'*');
ed5a857a 5068 if (!p) {
5069 decrRefCount(subst);
5070 return NULL;
5071 }
ed9b544e 5072
5073 prefixlen = p-spat;
5074 sublen = sdslen(ssub);
5075 postfixlen = sdslen(spat)-(prefixlen+1);
5076 memcpy(keyname.buf,spat,prefixlen);
5077 memcpy(keyname.buf+prefixlen,ssub,sublen);
5078 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5079 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5080 keyname.len = prefixlen+sublen+postfixlen;
5081
dfc5e96c 5082 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 5083 decrRefCount(subst);
5084
a4d1ba9a 5085 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 5086 return lookupKeyRead(db,&keyobj);
ed9b544e 5087}
5088
5089/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5090 * the additional parameter is not standard but a BSD-specific we have to
5091 * pass sorting parameters via the global 'server' structure */
5092static int sortCompare(const void *s1, const void *s2) {
5093 const redisSortObject *so1 = s1, *so2 = s2;
5094 int cmp;
5095
5096 if (!server.sort_alpha) {
5097 /* Numeric sorting. Here it's trivial as we precomputed scores */
5098 if (so1->u.score > so2->u.score) {
5099 cmp = 1;
5100 } else if (so1->u.score < so2->u.score) {
5101 cmp = -1;
5102 } else {
5103 cmp = 0;
5104 }
5105 } else {
5106 /* Alphanumeric sorting */
5107 if (server.sort_bypattern) {
5108 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5109 /* At least one compare object is NULL */
5110 if (so1->u.cmpobj == so2->u.cmpobj)
5111 cmp = 0;
5112 else if (so1->u.cmpobj == NULL)
5113 cmp = -1;
5114 else
5115 cmp = 1;
5116 } else {
5117 /* We have both the objects, use strcoll */
5118 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5119 }
5120 } else {
5121 /* Compare elements directly */
9d65a1bb 5122 robj *dec1, *dec2;
5123
5124 dec1 = getDecodedObject(so1->obj);
5125 dec2 = getDecodedObject(so2->obj);
5126 cmp = strcoll(dec1->ptr,dec2->ptr);
5127 decrRefCount(dec1);
5128 decrRefCount(dec2);
ed9b544e 5129 }
5130 }
5131 return server.sort_desc ? -cmp : cmp;
5132}
5133
5134/* The SORT command is the most complex command in Redis. Warning: this code
5135 * is optimized for speed and a bit less for readability
 *
 * The key in argv[1] must hold a list, a set or a sorted set. The options
 * recognized by the parser below are ASC, DESC, ALPHA, LIMIT <start> <count>,
 * STORE <key>, BY <pattern> and GET <pattern>; anything else is a syntax
 * error.
 *
 * Without STORE the result is sent to the client as a multi-bulk reply.
 * With STORE the result is saved as a list object under the given key and
 * the reply is the number of stored elements. */
5136static void sortCommand(redisClient *c) {
ed9b544e 5137 list *operations;
5138 int outputlen = 0;
5139 int desc = 0, alpha = 0;
5140 int limit_start = 0, limit_count = -1, start, end;
5141 int j, dontsort = 0, vectorlen;
5142 int getop = 0; /* GET operation counter */
443c6409 5143 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 5144 redisSortObject *vector; /* Resulting vector to sort */
5145
5146 /* Lookup the key to sort. It must be of the right types */
3305306f 5147 sortval = lookupKeyRead(c->db,c->argv[1]);
5148 if (sortval == NULL) {
d922ae65 5149 addReply(c,shared.nullmultibulk);
ed9b544e 5150 return;
5151 }
a5eb649b 5152 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5153 sortval->type != REDIS_ZSET)
5154 {
c937aa89 5155 addReply(c,shared.wrongtypeerr);
ed9b544e 5156 return;
5157 }
5158
5159 /* Create a list of operations to perform for every sorted element.
5160 * Operations can be GET/DEL/INCR/DECR */
 /* NOTE(review): the parser below only ever creates REDIS_SORT_GET
 * operations; DEL/INCR/DECR are not handled in this function. */
5161 operations = listCreate();
092dac2a 5162 listSetFreeMethod(operations,zfree);
ed9b544e 5163 j = 2;
5164
5165 /* Now we need to protect sortval incrementing its count, in the future
5166 * SORT may have options able to overwrite/delete keys during the sorting
5167 * and the sorted key itself may get destroyed */
5168 incrRefCount(sortval);
5169
5170 /* The SORT command has an SQL-alike syntax, parse it */
5171 while(j < c->argc) {
5172 int leftargs = c->argc-j-1;
5173 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5174 desc = 0;
5175 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5176 desc = 1;
5177 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5178 alpha = 1;
5179 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5180 limit_start = atoi(c->argv[j+1]->ptr);
5181 limit_count = atoi(c->argv[j+2]->ptr);
5182 j+=2;
443c6409 5183 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5184 storekey = c->argv[j+1];
5185 j++;
ed9b544e 5186 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5187 sortby = c->argv[j+1];
5188 /* If the BY pattern does not contain '*', i.e. it is constant,
5189 * we don't need to sort nor to lookup the weight keys. */
5190 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5191 j++;
5192 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5193 listAddNodeTail(operations,createSortOperation(
5194 REDIS_SORT_GET,c->argv[j+1]));
5195 getop++;
5196 j++;
ed9b544e 5197 } else {
 /* Unknown option, or an option missing its arguments: undo what
 * was set up above before replying with the syntax error. */
5198 decrRefCount(sortval);
5199 listRelease(operations);
c937aa89 5200 addReply(c,shared.syntaxerr);
ed9b544e 5201 return;
5202 }
5203 j++;
5204 }
5205
5206 /* Load the sorting vector with all the objects to sort */
a5eb649b 5207 switch(sortval->type) {
5208 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5209 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5210 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 5211 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 5212 }
ed9b544e 5213 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 5214 j = 0;
a5eb649b 5215
ed9b544e 5216 if (sortval->type == REDIS_LIST) {
5217 list *list = sortval->ptr;
6208b3a7 5218 listNode *ln;
5219
5220 listRewind(list);
5221 while((ln = listYield(list))) {
ed9b544e 5222 robj *ele = ln->value;
5223 vector[j].obj = ele;
5224 vector[j].u.score = 0;
5225 vector[j].u.cmpobj = NULL;
ed9b544e 5226 j++;
5227 }
5228 } else {
a5eb649b 5229 dict *set;
ed9b544e 5230 dictIterator *di;
5231 dictEntry *setele;
5232
 /* Sets and sorted sets are both iterated through a dict: for a
 * sorted set it is the dict member of the zset structure. */
a5eb649b 5233 if (sortval->type == REDIS_SET) {
5234 set = sortval->ptr;
5235 } else {
5236 zset *zs = sortval->ptr;
5237 set = zs->dict;
5238 }
5239
ed9b544e 5240 di = dictGetIterator(set);
ed9b544e 5241 while((setele = dictNext(di)) != NULL) {
5242 vector[j].obj = dictGetEntryKey(setele);
5243 vector[j].u.score = 0;
5244 vector[j].u.cmpobj = NULL;
5245 j++;
5246 }
5247 dictReleaseIterator(di);
5248 }
dfc5e96c 5249 redisAssert(j == vectorlen);
ed9b544e 5250
5251 /* Now it's time to load the right scores in the sorting vector */
5252 if (dontsort == 0) {
5253 for (j = 0; j < vectorlen; j++) {
5254 if (sortby) {
5255 robj *byval;
5256
3305306f 5257 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
 /* Missing or non-string weight: the element keeps the zero
 * score / NULL compare object set while loading the vector. */
ed9b544e 5258 if (!byval || byval->type != REDIS_STRING) continue;
5259 if (alpha) {
9d65a1bb 5260 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 5261 } else {
942a3961 5262 if (byval->encoding == REDIS_ENCODING_RAW) {
5263 vector[j].u.score = strtod(byval->ptr,NULL);
5264 } else {
9d65a1bb 5265 /* Don't need to decode the object if it's
5266 * integer-encoded (the only encoding supported) so
5267 * far. We can just cast it */
f1017b3f 5268 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 5269 vector[j].u.score = (long)byval->ptr;
f1017b3f 5270 } else
dfc5e96c 5271 redisAssert(1 != 1);
942a3961 5272 }
ed9b544e 5273 }
5274 } else {
942a3961 5275 if (!alpha) {
5276 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5277 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5278 else {
5279 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5280 vector[j].u.score = (long) vector[j].obj->ptr;
5281 else
dfc5e96c 5282 redisAssert(1 != 1);
942a3961 5283 }
5284 }
ed9b544e 5285 }
5286 }
5287 }
5288
5289 /* We are ready to sort the vector... perform a bit of sanity check
5290 * on the LIMIT option too. We'll use a partial version of quicksort. */
5291 start = (limit_start < 0) ? 0 : limit_start;
5292 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
 /* A start past the last element is turned into an empty range
 * (end == start-1), so the output loops below do not run. */
5293 if (start >= vectorlen) {
5294 start = vectorlen-1;
5295 end = vectorlen-2;
5296 }
5297 if (end >= vectorlen) end = vectorlen-1;
5298
5299 if (dontsort == 0) {
5300 server.sort_desc = desc;
5301 server.sort_alpha = alpha;
5302 server.sort_bypattern = sortby ? 1 : 0;
 /* pqsort is used only for BY sorts restricted to a sub-range,
 * every other case goes through the plain qsort. */
5f5b9840 5303 if (sortby && (start != 0 || end != vectorlen-1))
5304 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5305 else
5306 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 5307 }
5308
5309 /* Send command output to the output buffer, performing the specified
5310 * GET/DEL/INCR/DECR operations if any. */
 /* With GET operations every element in range produces one output item
 * per GET, otherwise exactly one item (the element itself). */
5311 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 5312 if (storekey == NULL) {
5313 /* STORE option not specified, send the sorting result to client */
5314 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5315 for (j = start; j <= end; j++) {
5316 listNode *ln;
5317 if (!getop) {
5318 addReplyBulkLen(c,vector[j].obj);
5319 addReply(c,vector[j].obj);
5320 addReply(c,shared.crlf);
5321 }
5322 listRewind(operations);
5323 while((ln = listYield(operations))) {
5324 redisSortOperation *sop = ln->value;
5325 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5326 vector[j].obj);
5327
5328 if (sop->type == REDIS_SORT_GET) {
 /* A missing or non-string value is reported as a null bulk */
5329 if (!val || val->type != REDIS_STRING) {
5330 addReply(c,shared.nullbulk);
5331 } else {
5332 addReplyBulkLen(c,val);
5333 addReply(c,val);
5334 addReply(c,shared.crlf);
5335 }
5336 } else {
dfc5e96c 5337 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 5338 }
5339 }
ed9b544e 5340 }
443c6409 5341 } else {
5342 robj *listObject = createListObject();
5343 list *listPtr = (list*) listObject->ptr;
5344
5345 /* STORE option specified, set the sorting result as a List object */
5346 for (j = start; j <= end; j++) {
5347 listNode *ln;
5348 if (!getop) {
5349 listAddNodeTail(listPtr,vector[j].obj);
5350 incrRefCount(vector[j].obj);
5351 }
5352 listRewind(operations);
5353 while((ln = listYield(operations))) {
5354 redisSortOperation *sop = ln->value;
5355 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5356 vector[j].obj);
5357
5358 if (sop->type == REDIS_SORT_GET) {
 /* A missing or non-string value is stored as an empty string */
5359 if (!val || val->type != REDIS_STRING) {
5360 listAddNodeTail(listPtr,createStringObject("",0));
5361 } else {
5362 listAddNodeTail(listPtr,val);
5363 incrRefCount(val);
5364 }
ed9b544e 5365 } else {
dfc5e96c 5366 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 5367 }
ed9b544e 5368 }
ed9b544e 5369 }
 /* NOTE(review): presumably dictReplace() returns non-zero only when
 * the key was newly added, hence the extra reference - confirm. */
121796f7 5370 if (dictReplace(c->db->dict,storekey,listObject)) {
5371 incrRefCount(storekey);
5372 }
443c6409 5373 /* Note: we add 1 because the DB is dirty anyway since even if the
5374 * SORT result is empty a new key is set and maybe the old content
5375 * replaced. */
5376 server.dirty += 1+outputlen;
5377 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 5378 }
5379
5380 /* Cleanup */
5381 decrRefCount(sortval);
5382 listRelease(operations);
 /* Compare objects are only created for BY + ALPHA sorts (see the
 * getDecodedObject() call above), so only those are released. */
5383 for (j = 0; j < vectorlen; j++) {
5384 if (sortby && alpha && vector[j].u.cmpobj)
5385 decrRefCount(vector[j].u.cmpobj);
5386 }
5387 zfree(vector);
5388}
5389
1c85b79f 5390/* Create the string returned by the INFO command. This is decoupled
5391 * by the INFO command itself as we need to report the same information
5392 * on memory corruption problems. */
5393static sds genRedisInfoString(void) {
ed9b544e 5394 sds info;
5395 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 5396 int j;
ed9b544e 5397
5398 info = sdscatprintf(sdsempty(),
5399 "redis_version:%s\r\n"
f1017b3f 5400 "arch_bits:%s\r\n"
7a932b74 5401 "multiplexing_api:%s\r\n"
682ac724 5402 "uptime_in_seconds:%ld\r\n"
5403 "uptime_in_days:%ld\r\n"
ed9b544e 5404 "connected_clients:%d\r\n"
5405 "connected_slaves:%d\r\n"
f86a74e9 5406 "blocked_clients:%d\r\n"
5fba9f71 5407 "used_memory:%zu\r\n"
ed9b544e 5408 "changes_since_last_save:%lld\r\n"
be2bb6b0 5409 "bgsave_in_progress:%d\r\n"
682ac724 5410 "last_save_time:%ld\r\n"
b3fad521 5411 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 5412 "total_connections_received:%lld\r\n"
5413 "total_commands_processed:%lld\r\n"
a0f643ea 5414 "role:%s\r\n"
ed9b544e 5415 ,REDIS_VERSION,
f1017b3f 5416 (sizeof(long) == 8) ? "64" : "32",
7a932b74 5417 aeGetApiName(),
a0f643ea 5418 uptime,
5419 uptime/(3600*24),
ed9b544e 5420 listLength(server.clients)-listLength(server.slaves),
5421 listLength(server.slaves),
f86a74e9 5422 server.blockedclients,
ed9b544e 5423 server.usedmemory,
5424 server.dirty,
9d65a1bb 5425 server.bgsavechildpid != -1,
ed9b544e 5426 server.lastsave,
b3fad521 5427 server.bgrewritechildpid != -1,
ed9b544e 5428 server.stat_numconnections,
5429 server.stat_numcommands,
a0f643ea 5430 server.masterhost == NULL ? "master" : "slave"
ed9b544e 5431 );
a0f643ea 5432 if (server.masterhost) {
5433 info = sdscatprintf(info,
5434 "master_host:%s\r\n"
5435 "master_port:%d\r\n"
5436 "master_link_status:%s\r\n"
5437 "master_last_io_seconds_ago:%d\r\n"
5438 ,server.masterhost,
5439 server.masterport,
5440 (server.replstate == REDIS_REPL_CONNECTED) ?
5441 "up" : "down",
f72b934d 5442 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 5443 );
5444 }
c3cb078d 5445 for (j = 0; j < server.dbnum; j++) {
5446 long long keys, vkeys;
5447
5448 keys = dictSize(server.db[j].dict);
5449 vkeys = dictSize(server.db[j].expires);
5450 if (keys || vkeys) {
9d65a1bb 5451 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 5452 j, keys, vkeys);
5453 }
5454 }
1c85b79f 5455 return info;
5456}
5457
5458static void infoCommand(redisClient *c) {
5459 sds info = genRedisInfoString();
83c6a618 5460 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5461 (unsigned long)sdslen(info)));
ed9b544e 5462 addReplySds(c,info);
70003d28 5463 addReply(c,shared.crlf);
ed9b544e 5464}
5465
3305306f 5466static void monitorCommand(redisClient *c) {
5467 /* ignore MONITOR if aleady slave or in monitor mode */
5468 if (c->flags & REDIS_SLAVE) return;
5469
5470 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5471 c->slaveseldb = 0;
6b47e12e 5472 listAddNodeTail(server.monitors,c);
3305306f 5473 addReply(c,shared.ok);
5474}
5475
5476/* ================================= Expire ================================= */
5477static int removeExpire(redisDb *db, robj *key) {
5478 if (dictDelete(db->expires,key) == DICT_OK) {
5479 return 1;
5480 } else {
5481 return 0;
5482 }
5483}
5484
5485static int setExpire(redisDb *db, robj *key, time_t when) {
5486 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5487 return 0;
5488 } else {
5489 incrRefCount(key);
5490 return 1;
5491 }
5492}
5493
bb32ede5 5494/* Return the expire time of the specified key, or -1 if no expire
5495 * is associated with this key (i.e. the key is non volatile) */
5496static time_t getExpire(redisDb *db, robj *key) {
5497 dictEntry *de;
5498
5499 /* No expire? return ASAP */
5500 if (dictSize(db->expires) == 0 ||
5501 (de = dictFind(db->expires,key)) == NULL) return -1;
5502
5503 return (time_t) dictGetEntryVal(de);
5504}
5505
3305306f 5506static int expireIfNeeded(redisDb *db, robj *key) {
5507 time_t when;
5508 dictEntry *de;
5509
5510 /* No expire? return ASAP */
5511 if (dictSize(db->expires) == 0 ||
5512 (de = dictFind(db->expires,key)) == NULL) return 0;
5513
5514 /* Lookup the expire */
5515 when = (time_t) dictGetEntryVal(de);
5516 if (time(NULL) <= when) return 0;
5517
5518 /* Delete the key */
5519 dictDelete(db->expires,key);
5520 return dictDelete(db->dict,key) == DICT_OK;
5521}
5522
5523static int deleteIfVolatile(redisDb *db, robj *key) {
5524 dictEntry *de;
5525
5526 /* No expire? return ASAP */
5527 if (dictSize(db->expires) == 0 ||
5528 (de = dictFind(db->expires,key)) == NULL) return 0;
5529
5530 /* Delete the key */
0c66a471 5531 server.dirty++;
3305306f 5532 dictDelete(db->expires,key);
5533 return dictDelete(db->dict,key) == DICT_OK;
5534}
5535
802e8373 5536static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 5537 dictEntry *de;
3305306f 5538
802e8373 5539 de = dictFind(c->db->dict,key);
3305306f 5540 if (de == NULL) {
5541 addReply(c,shared.czero);
5542 return;
5543 }
43e5ccdf 5544 if (seconds < 0) {
5545 if (deleteKey(c->db,key)) server.dirty++;
5546 addReply(c, shared.cone);
3305306f 5547 return;
5548 } else {
5549 time_t when = time(NULL)+seconds;
802e8373 5550 if (setExpire(c->db,key,when)) {
3305306f 5551 addReply(c,shared.cone);
77423026 5552 server.dirty++;
5553 } else {
3305306f 5554 addReply(c,shared.czero);
77423026 5555 }
3305306f 5556 return;
5557 }
5558}
5559
802e8373 5560static void expireCommand(redisClient *c) {
5561 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
5562}
5563
5564static void expireatCommand(redisClient *c) {
5565 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
5566}
5567
fd88489a 5568static void ttlCommand(redisClient *c) {
5569 time_t expire;
5570 int ttl = -1;
5571
5572 expire = getExpire(c->db,c->argv[1]);
5573 if (expire != -1) {
5574 ttl = (int) (expire-time(NULL));
5575 if (ttl < 0) ttl = -1;
5576 }
5577 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
5578}
5579
6e469882 5580/* ================================ MULTI/EXEC ============================== */
5581
5582/* Client state initialization for MULTI/EXEC */
5583static void initClientMultiState(redisClient *c) {
5584 c->mstate.commands = NULL;
5585 c->mstate.count = 0;
5586}
5587
5588/* Release all the resources associated with MULTI/EXEC state */
5589static void freeClientMultiState(redisClient *c) {
5590 int j;
5591
5592 for (j = 0; j < c->mstate.count; j++) {
5593 int i;
5594 multiCmd *mc = c->mstate.commands+j;
5595
5596 for (i = 0; i < mc->argc; i++)
5597 decrRefCount(mc->argv[i]);
5598 zfree(mc->argv);
5599 }
5600 zfree(c->mstate.commands);
5601}
5602
5603/* Add a new command into the MULTI commands queue */
5604static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
5605 multiCmd *mc;
5606 int j;
5607
5608 c->mstate.commands = zrealloc(c->mstate.commands,
5609 sizeof(multiCmd)*(c->mstate.count+1));
5610 mc = c->mstate.commands+c->mstate.count;
5611 mc->cmd = cmd;
5612 mc->argc = c->argc;
5613 mc->argv = zmalloc(sizeof(robj*)*c->argc);
5614 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
5615 for (j = 0; j < c->argc; j++)
5616 incrRefCount(mc->argv[j]);
5617 c->mstate.count++;
5618}
5619
5620static void multiCommand(redisClient *c) {
5621 c->flags |= REDIS_MULTI;
36c548f0 5622 addReply(c,shared.ok);
6e469882 5623}
5624
5625static void execCommand(redisClient *c) {
5626 int j;
5627 robj **orig_argv;
5628 int orig_argc;
5629
5630 if (!(c->flags & REDIS_MULTI)) {
5631 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
5632 return;
5633 }
5634
5635 orig_argv = c->argv;
5636 orig_argc = c->argc;
5637 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
5638 for (j = 0; j < c->mstate.count; j++) {
5639 c->argc = c->mstate.commands[j].argc;
5640 c->argv = c->mstate.commands[j].argv;
5641 call(c,c->mstate.commands[j].cmd);
5642 }
5643 c->argv = orig_argv;
5644 c->argc = orig_argc;
5645 freeClientMultiState(c);
5646 initClientMultiState(c);
5647 c->flags &= (~REDIS_MULTI);
5648}
5649
4409877e 5650/* =========================== Blocking Operations ========================= */
5651
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
5680
5681/* Set a client in blocking mode for the specified key, with the specified
5682 * timeout */
b177fd30 5683static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 5684 dictEntry *de;
5685 list *l;
b177fd30 5686 int j;
4409877e 5687
b177fd30 5688 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
5689 c->blockingkeysnum = numkeys;
4409877e 5690 c->blockingto = timeout;
b177fd30 5691 for (j = 0; j < numkeys; j++) {
5692 /* Add the key in the client structure, to map clients -> keys */
5693 c->blockingkeys[j] = keys[j];
5694 incrRefCount(keys[j]);
4409877e 5695
b177fd30 5696 /* And in the other "side", to map keys -> clients */
5697 de = dictFind(c->db->blockingkeys,keys[j]);
5698 if (de == NULL) {
5699 int retval;
5700
5701 /* For every key we take a list of clients blocked for it */
5702 l = listCreate();
5703 retval = dictAdd(c->db->blockingkeys,keys[j],l);
5704 incrRefCount(keys[j]);
5705 assert(retval == DICT_OK);
5706 } else {
5707 l = dictGetEntryVal(de);
5708 }
5709 listAddNodeTail(l,c);
4409877e 5710 }
b177fd30 5711 /* Mark the client as a blocked client */
4409877e 5712 c->flags |= REDIS_BLOCKED;
5713 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
f86a74e9 5714 server.blockedclients++;
4409877e 5715}
5716
5717/* Unblock a client that's waiting in a blocking operation such as BLPOP */
5718static void unblockClient(redisClient *c) {
5719 dictEntry *de;
5720 list *l;
b177fd30 5721 int j;
4409877e 5722
b177fd30 5723 assert(c->blockingkeys != NULL);
5724 /* The client may wait for multiple keys, so unblock it for every key. */
5725 for (j = 0; j < c->blockingkeysnum; j++) {
5726 /* Remove this client from the list of clients waiting for this key. */
5727 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
5728 assert(de != NULL);
5729 l = dictGetEntryVal(de);
5730 listDelNode(l,listSearchKey(l,c));
5731 /* If the list is empty we need to remove it to avoid wasting memory */
5732 if (listLength(l) == 0)
5733 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
5734 decrRefCount(c->blockingkeys[j]);
5735 }
5736 /* Cleanup the client structure */
5737 zfree(c->blockingkeys);
5738 c->blockingkeys = NULL;
4409877e 5739 c->flags &= (~REDIS_BLOCKED);
f86a74e9 5740 server.blockedclients--;
4409877e 5741 /* Ok now we are ready to get read events from socket, note that we
5742 * can't trap errors here as it's possible that unblockClients() is
5743 * called from freeClient() itself, and the only thing we can do
5744 * if we failed to register the READABLE event is to kill the client.
5745 * Still the following function should never fail in the real world as
5746 * we are sure the file descriptor is sane, and we exit on out of mem. */
5747 aeCreateFileEvent(server.el, c->fd, AE_READABLE, readQueryFromClient, c);
5748 /* As a final step we want to process data if there is some command waiting
5749 * in the input buffer. Note that this is safe even if unblockClient()
5750 * gets called from freeClient() because freeClient() will be smart
5751 * enough to call this function *after* c->querybuf was set to NULL. */
5752 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
5753}
5754
5755/* This should be called from any function PUSHing into lists.
5756 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
5757 * 'ele' is the element pushed.
5758 *
5759 * If the function returns 0 there was no client waiting for a list push
5760 * against this key.
5761 *
5762 * If the function returns 1 there was a client waiting for a list push
5763 * against this key, the element was passed to this client thus it's not
5764 * needed to actually add it to the list and the caller should return asap. */
5765static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
5766 struct dictEntry *de;
5767 redisClient *receiver;
5768 list *l;
5769 listNode *ln;
5770
5771 de = dictFind(c->db->blockingkeys,key);
5772 if (de == NULL) return 0;
5773 l = dictGetEntryVal(de);
5774 ln = listFirst(l);
5775 assert(ln != NULL);
5776 receiver = ln->value;
4409877e 5777
b177fd30 5778 addReplySds(receiver,sdsnew("*2\r\n"));
5779 addReplyBulkLen(receiver,key);
5780 addReply(receiver,key);
5781 addReply(receiver,shared.crlf);
4409877e 5782 addReplyBulkLen(receiver,ele);
5783 addReply(receiver,ele);
5784 addReply(receiver,shared.crlf);
5785 unblockClient(receiver);
5786 return 1;
5787}
5788
5789/* Blocking RPOP/LPOP */
5790static void blockingPopGenericCommand(redisClient *c, int where) {
5791 robj *o;
5792 time_t timeout;
b177fd30 5793 int j;
4409877e 5794
b177fd30 5795 for (j = 1; j < c->argc-1; j++) {
5796 o = lookupKeyWrite(c->db,c->argv[j]);
5797 if (o != NULL) {
5798 if (o->type != REDIS_LIST) {
5799 addReply(c,shared.wrongtypeerr);
4409877e 5800 return;
b177fd30 5801 } else {
5802 list *list = o->ptr;
5803 if (listLength(list) != 0) {
5804 /* If the list contains elements fall back to the usual
5805 * non-blocking POP operation */
5806 robj *argv[2], **orig_argv;
5807 int orig_argc;
5808
5809 /* We need to alter the command arguments before to call
5810 * popGenericCommand() as the command takes a single key. */
5811 orig_argv = c->argv;
5812 orig_argc = c->argc;
5813 argv[1] = c->argv[j];
5814 c->argv = argv;
5815 c->argc = 2;
5816
5817 /* Also the return value is different, we need to output
5818 * the multi bulk reply header and the key name. The
5819 * "real" command will add the last element (the value)
5820 * for us. If this souds like an hack to you it's just
5821 * because it is... */
5822 addReplySds(c,sdsnew("*2\r\n"));
5823 addReplyBulkLen(c,argv[1]);
5824 addReply(c,argv[1]);
5825 addReply(c,shared.crlf);
5826 popGenericCommand(c,where);
5827
5828 /* Fix the client structure with the original stuff */
5829 c->argv = orig_argv;
5830 c->argc = orig_argc;
5831 return;
5832 }
4409877e 5833 }
5834 }
5835 }
5836 /* If the list is empty or the key does not exists we must block */
b177fd30 5837 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 5838 if (timeout > 0) timeout += time(NULL);
b177fd30 5839 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 5840}
5841
5842static void blpopCommand(redisClient *c) {
5843 blockingPopGenericCommand(c,REDIS_HEAD);
5844}
5845
5846static void brpopCommand(redisClient *c) {
5847 blockingPopGenericCommand(c,REDIS_TAIL);
5848}
5849
ed9b544e 5850/* =============================== Replication ============================= */
5851
a4d1ba9a 5852static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 5853 ssize_t nwritten, ret = size;
5854 time_t start = time(NULL);
5855
5856 timeout++;
5857 while(size) {
5858 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
5859 nwritten = write(fd,ptr,size);
5860 if (nwritten == -1) return -1;
5861 ptr += nwritten;
5862 size -= nwritten;
5863 }
5864 if ((time(NULL)-start) > timeout) {
5865 errno = ETIMEDOUT;
5866 return -1;
5867 }
5868 }
5869 return ret;
5870}
5871
a4d1ba9a 5872static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 5873 ssize_t nread, totread = 0;
5874 time_t start = time(NULL);
5875
5876 timeout++;
5877 while(size) {
5878 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
5879 nread = read(fd,ptr,size);
5880 if (nread == -1) return -1;
5881 ptr += nread;
5882 size -= nread;
5883 totread += nread;
5884 }
5885 if ((time(NULL)-start) > timeout) {
5886 errno = ETIMEDOUT;
5887 return -1;
5888 }
5889 }
5890 return totread;
5891}
5892
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. 'timeout' is passed unchanged to syncRead() for every single byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * before it is replaced by the string terminator. At most size-1 bytes are
 * stored so the result is always null terminated; when the buffer fills up
 * before a newline is seen the truncated line is returned.
 *
 * Returns the number of bytes read before the newline (a trailing '\r' is
 * included in the count), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One slot of the buffer was consumed: without this the loop is not
         * bounded and a line longer than the buffer overflows it. */
        size--;
    }
    return nread;
}
5913
5914static void syncCommand(redisClient *c) {
40d224a9 5915 /* ignore SYNC if aleady slave or in monitor mode */
5916 if (c->flags & REDIS_SLAVE) return;
5917
5918 /* SYNC can't be issued when the server has pending data to send to
5919 * the client about already issued commands. We need a fresh reply
5920 * buffer registering the differences between the BGSAVE and the current
5921 * dataset, so that we can copy to other slaves if needed. */
5922 if (listLength(c->reply) != 0) {
5923 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
5924 return;
5925 }
5926
5927 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
5928 /* Here we need to check if there is a background saving operation
5929 * in progress, or if it is required to start one */
9d65a1bb 5930 if (server.bgsavechildpid != -1) {
40d224a9 5931 /* Ok a background save is in progress. Let's check if it is a good
5932 * one for replication, i.e. if there is another slave that is
5933 * registering differences since the server forked to save */
5934 redisClient *slave;
5935 listNode *ln;
5936
6208b3a7 5937 listRewind(server.slaves);
5938 while((ln = listYield(server.slaves))) {
40d224a9 5939 slave = ln->value;
5940 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 5941 }
5942 if (ln) {
5943 /* Perfect, the server is already registering differences for
5944 * another slave. Set the right state, and copy the buffer. */
5945 listRelease(c->reply);
5946 c->reply = listDup(slave->reply);
40d224a9 5947 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
5948 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
5949 } else {
5950 /* No way, we need to wait for the next BGSAVE in order to
5951 * register differences */
5952 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
5953 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
5954 }
5955 } else {
5956 /* Ok we don't have a BGSAVE in progress, let's start one */
5957 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
5958 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
5959 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
5960 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
5961 return;
5962 }
5963 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
5964 }
6208b3a7 5965 c->repldbfd = -1;
40d224a9 5966 c->flags |= REDIS_SLAVE;
5967 c->slaveseldb = 0;
6b47e12e 5968 listAddNodeTail(server.slaves,c);
40d224a9 5969 return;
5970}
5971
6208b3a7 5972static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
5973 redisClient *slave = privdata;
5974 REDIS_NOTUSED(el);
5975 REDIS_NOTUSED(mask);
5976 char buf[REDIS_IOBUF_LEN];
5977 ssize_t nwritten, buflen;
5978
5979 if (slave->repldboff == 0) {
5980 /* Write the bulk write count before to transfer the DB. In theory here
5981 * we don't know how much room there is in the output buffer of the
5982 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
5983 * operations) will never be smaller than the few bytes we need. */
5984 sds bulkcount;
5985
5986 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
5987 slave->repldbsize);
5988 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
5989 {
5990 sdsfree(bulkcount);
5991 freeClient(slave);
5992 return;
5993 }
5994 sdsfree(bulkcount);
5995 }
5996 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
5997 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
5998 if (buflen <= 0) {
5999 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6000 (buflen == 0) ? "premature EOF" : strerror(errno));
6001 freeClient(slave);
6002 return;
6003 }
6004 if ((nwritten = write(fd,buf,buflen)) == -1) {
6005 redisLog(REDIS_DEBUG,"Write error sending DB to slave: %s",
6006 strerror(errno));
6007 freeClient(slave);
6008 return;
6009 }
6010 slave->repldboff += nwritten;
6011 if (slave->repldboff == slave->repldbsize) {
6012 close(slave->repldbfd);
6013 slave->repldbfd = -1;
6014 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6015 slave->replstate = REDIS_REPL_ONLINE;
6016 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 6017 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 6018 freeClient(slave);
6019 return;
6020 }
6021 addReplySds(slave,sdsempty());
6022 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6023 }
6024}
ed9b544e 6025
a3b21203 6026/* This function is called at the end of every backgrond saving.
6027 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6028 * otherwise REDIS_ERR is passed to the function.
6029 *
6030 * The goal of this function is to handle slaves waiting for a successful
6031 * background saving in order to perform non-blocking synchronization. */
6032static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 6033 listNode *ln;
6034 int startbgsave = 0;
ed9b544e 6035
6208b3a7 6036 listRewind(server.slaves);
6037 while((ln = listYield(server.slaves))) {
6038 redisClient *slave = ln->value;
ed9b544e 6039
6208b3a7 6040 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6041 startbgsave = 1;
6042 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6043 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 6044 struct redis_stat buf;
6208b3a7 6045
6046 if (bgsaveerr != REDIS_OK) {
6047 freeClient(slave);
6048 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6049 continue;
6050 }
6051 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 6052 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 6053 freeClient(slave);
6054 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6055 continue;
6056 }
6057 slave->repldboff = 0;
6058 slave->repldbsize = buf.st_size;
6059 slave->replstate = REDIS_REPL_SEND_BULK;
6060 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 6061 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 6062 freeClient(slave);
6063 continue;
6064 }
6065 }
ed9b544e 6066 }
6208b3a7 6067 if (startbgsave) {
6068 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6069 listRewind(server.slaves);
6070 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6071 while((ln = listYield(server.slaves))) {
6072 redisClient *slave = ln->value;
ed9b544e 6073
6208b3a7 6074 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6075 freeClient(slave);
6076 }
6077 }
6078 }
ed9b544e 6079}
6080
6081static int syncWithMaster(void) {
d0ccebcf 6082 char buf[1024], tmpfile[256], authcmd[1024];
ed9b544e 6083 int dumpsize;
6084 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6085 int dfd;
6086
6087 if (fd == -1) {
6088 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6089 strerror(errno));
6090 return REDIS_ERR;
6091 }
d0ccebcf 6092
6093 /* AUTH with the master if required. */
6094 if(server.masterauth) {
6095 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6096 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6097 close(fd);
6098 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6099 strerror(errno));
6100 return REDIS_ERR;
6101 }
6102 /* Read the AUTH result. */
6103 if (syncReadLine(fd,buf,1024,3600) == -1) {
6104 close(fd);
6105 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6106 strerror(errno));
6107 return REDIS_ERR;
6108 }
6109 if (buf[0] != '+') {
6110 close(fd);
6111 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6112 return REDIS_ERR;
6113 }
6114 }
6115
ed9b544e 6116 /* Issue the SYNC command */
6117 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6118 close(fd);
6119 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6120 strerror(errno));
6121 return REDIS_ERR;
6122 }
6123 /* Read the bulk write count */
8c4d91fc 6124 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 6125 close(fd);
6126 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6127 strerror(errno));
6128 return REDIS_ERR;
6129 }
4aa701c1 6130 if (buf[0] != '$') {
6131 close(fd);
6132 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6133 return REDIS_ERR;
6134 }
c937aa89 6135 dumpsize = atoi(buf+1);
ed9b544e 6136 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6137 /* Read the bulk write data on a temp file */
6138 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6139 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6140 if (dfd == -1) {
6141 close(fd);
6142 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6143 return REDIS_ERR;
6144 }
6145 while(dumpsize) {
6146 int nread, nwritten;
6147
6148 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6149 if (nread == -1) {
6150 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6151 strerror(errno));
6152 close(fd);
6153 close(dfd);
6154 return REDIS_ERR;
6155 }
6156 nwritten = write(dfd,buf,nread);
6157 if (nwritten == -1) {
6158 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6159 close(fd);
6160 close(dfd);
6161 return REDIS_ERR;
6162 }
6163 dumpsize -= nread;
6164 }
6165 close(dfd);
6166 if (rename(tmpfile,server.dbfilename) == -1) {
6167 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6168 unlink(tmpfile);
6169 close(fd);
6170 return REDIS_ERR;
6171 }
6172 emptyDb();
f78fd11b 6173 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 6174 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6175 close(fd);
6176 return REDIS_ERR;
6177 }
6178 server.master = createClient(fd);
6179 server.master->flags |= REDIS_MASTER;
179b3952 6180 server.master->authenticated = 1;
ed9b544e 6181 server.replstate = REDIS_REPL_CONNECTED;
6182 return REDIS_OK;
6183}
6184
321b0e13 6185static void slaveofCommand(redisClient *c) {
6186 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6187 !strcasecmp(c->argv[2]->ptr,"one")) {
6188 if (server.masterhost) {
6189 sdsfree(server.masterhost);
6190 server.masterhost = NULL;
6191 if (server.master) freeClient(server.master);
6192 server.replstate = REDIS_REPL_NONE;
6193 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6194 }
6195 } else {
6196 sdsfree(server.masterhost);
6197 server.masterhost = sdsdup(c->argv[1]->ptr);
6198 server.masterport = atoi(c->argv[2]->ptr);
6199 if (server.master) freeClient(server.master);
6200 server.replstate = REDIS_REPL_CONNECT;
6201 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6202 server.masterhost, server.masterport);
6203 }
6204 addReply(c,shared.ok);
6205}
6206
3fd78bcd 6207/* ============================ Maxmemory directive ======================== */
6208
6209/* This function gets called when 'maxmemory' is set on the config file to limit
6210 * the max memory used by the server, and we are out of memory.
6211 * This function will try to, in order:
6212 *
6213 * - Free objects from the free list
6214 * - Try to remove keys with an EXPIRE set
6215 *
6216 * It is not possible to free enough memory to reach used-memory < maxmemory
6217 * the server will start refusing commands that will enlarge even more the
6218 * memory usage.
6219 */
6220static void freeMemoryIfNeeded(void) {
6221 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
6222 if (listLength(server.objfreelist)) {
6223 robj *o;
6224
6225 listNode *head = listFirst(server.objfreelist);
6226 o = listNodeValue(head);
6227 listDelNode(server.objfreelist,head);
6228 zfree(o);
6229 } else {
6230 int j, k, freed = 0;
6231
6232 for (j = 0; j < server.dbnum; j++) {
6233 int minttl = -1;
6234 robj *minkey = NULL;
6235 struct dictEntry *de;
6236
6237 if (dictSize(server.db[j].expires)) {
6238 freed = 1;
6239 /* From a sample of three keys drop the one nearest to
6240 * the natural expire */
6241 for (k = 0; k < 3; k++) {
6242 time_t t;
6243
6244 de = dictGetRandomKey(server.db[j].expires);
6245 t = (time_t) dictGetEntryVal(de);
6246 if (minttl == -1 || t < minttl) {
6247 minkey = dictGetEntryKey(de);
6248 minttl = t;
6249 }
6250 }
6251 deleteKey(server.db+j,minkey);
6252 }
6253 }
6254 if (!freed) return; /* nothing to free... */
6255 }
6256 }
6257}
6258
f80dff62 6259/* ============================== Append Only file ========================== */
6260
6261static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6262 sds buf = sdsempty();
6263 int j;
6264 ssize_t nwritten;
6265 time_t now;
6266 robj *tmpargv[3];
6267
6268 /* The DB this command was targetting is not the same as the last command
6269 * we appendend. To issue a SELECT command is needed. */
6270 if (dictid != server.appendseldb) {
6271 char seldb[64];
6272
6273 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 6274 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 6275 (unsigned long)strlen(seldb),seldb);
f80dff62 6276 server.appendseldb = dictid;
6277 }
6278
6279 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6280 * EXPIREs into EXPIREATs calls */
6281 if (cmd->proc == expireCommand) {
6282 long when;
6283
6284 tmpargv[0] = createStringObject("EXPIREAT",8);
6285 tmpargv[1] = argv[1];
6286 incrRefCount(argv[1]);
6287 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6288 tmpargv[2] = createObject(REDIS_STRING,
6289 sdscatprintf(sdsempty(),"%ld",when));
6290 argv = tmpargv;
6291 }
6292
6293 /* Append the actual command */
6294 buf = sdscatprintf(buf,"*%d\r\n",argc);
6295 for (j = 0; j < argc; j++) {
6296 robj *o = argv[j];
6297
9d65a1bb 6298 o = getDecodedObject(o);
83c6a618 6299 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 6300 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6301 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 6302 decrRefCount(o);
f80dff62 6303 }
6304
6305 /* Free the objects from the modified argv for EXPIREAT */
6306 if (cmd->proc == expireCommand) {
6307 for (j = 0; j < 3; j++)
6308 decrRefCount(argv[j]);
6309 }
6310
6311 /* We want to perform a single write. This should be guaranteed atomic
6312 * at least if the filesystem we are writing is a real physical one.
6313 * While this will save us against the server being killed I don't think
6314 * there is much to do about the whole server stopping for power problems
6315 * or alike */
6316 nwritten = write(server.appendfd,buf,sdslen(buf));
6317 if (nwritten != (signed)sdslen(buf)) {
6318 /* Ooops, we are in troubles. The best thing to do for now is
6319 * to simply exit instead to give the illusion that everything is
6320 * working as expected. */
6321 if (nwritten == -1) {
6322 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6323 } else {
6324 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6325 }
6326 exit(1);
6327 }
85a83172 6328 /* If a background append only file rewriting is in progress we want to
6329 * accumulate the differences between the child DB and the current one
6330 * in a buffer, so that when the child process will do its work we
6331 * can append the differences to the new append only file. */
6332 if (server.bgrewritechildpid != -1)
6333 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6334
6335 sdsfree(buf);
f80dff62 6336 now = time(NULL);
6337 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6338 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6339 now-server.lastfsync > 1))
6340 {
6341 fsync(server.appendfd); /* Let's try to get this data on the disk */
6342 server.lastfsync = now;
6343 }
6344}
6345
6346/* In Redis commands are always executed in the context of a client, so in
6347 * order to load the append only file we need to create a fake client. */
6348static struct redisClient *createFakeClient(void) {
6349 struct redisClient *c = zmalloc(sizeof(*c));
6350
6351 selectDb(c,0);
6352 c->fd = -1;
6353 c->querybuf = sdsempty();
6354 c->argc = 0;
6355 c->argv = NULL;
6356 c->flags = 0;
9387d17d 6357 /* We set the fake client as a slave waiting for the synchronization
6358 * so that Redis will not try to send replies to this client. */
6359 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 6360 c->reply = listCreate();
6361 listSetFreeMethod(c->reply,decrRefCount);
6362 listSetDupMethod(c->reply,dupClientReplyValue);
6363 return c;
6364}
6365
6366static void freeFakeClient(struct redisClient *c) {
6367 sdsfree(c->querybuf);
6368 listRelease(c->reply);
6369 zfree(c);
6370}
6371
6372/* Replay the append log file. On error REDIS_OK is returned. On non fatal
6373 * error (the append only file is zero-length) REDIS_ERR is returned. On
6374 * fatal error an error message is logged and the program exists. */
6375int loadAppendOnlyFile(char *filename) {
6376 struct redisClient *fakeClient;
6377 FILE *fp = fopen(filename,"r");
6378 struct redis_stat sb;
6379
6380 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6381 return REDIS_ERR;
6382
6383 if (fp == NULL) {
6384 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6385 exit(1);
6386 }
6387
6388 fakeClient = createFakeClient();
6389 while(1) {
6390 int argc, j;
6391 unsigned long len;
6392 robj **argv;
6393 char buf[128];
6394 sds argsds;
6395 struct redisCommand *cmd;
6396
6397 if (fgets(buf,sizeof(buf),fp) == NULL) {
6398 if (feof(fp))
6399 break;
6400 else
6401 goto readerr;
6402 }
6403 if (buf[0] != '*') goto fmterr;
6404 argc = atoi(buf+1);
6405 argv = zmalloc(sizeof(robj*)*argc);
6406 for (j = 0; j < argc; j++) {
6407 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6408 if (buf[0] != '$') goto fmterr;
6409 len = strtol(buf+1,NULL,10);
6410 argsds = sdsnewlen(NULL,len);
0f151ef1 6411 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 6412 argv[j] = createObject(REDIS_STRING,argsds);
6413 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6414 }
6415
6416 /* Command lookup */
6417 cmd = lookupCommand(argv[0]->ptr);
6418 if (!cmd) {
6419 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6420 exit(1);
6421 }
6422 /* Try object sharing and encoding */
6423 if (server.shareobjects) {
6424 int j;
6425 for(j = 1; j < argc; j++)
6426 argv[j] = tryObjectSharing(argv[j]);
6427 }
6428 if (cmd->flags & REDIS_CMD_BULK)
6429 tryObjectEncoding(argv[argc-1]);
6430 /* Run the command in the context of a fake client */
6431 fakeClient->argc = argc;
6432 fakeClient->argv = argv;
6433 cmd->proc(fakeClient);
6434 /* Discard the reply objects list from the fake client */
6435 while(listLength(fakeClient->reply))
6436 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6437 /* Clean up, ready for the next command */
6438 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6439 zfree(argv);
6440 }
6441 fclose(fp);
6442 freeFakeClient(fakeClient);
6443 return REDIS_OK;
6444
6445readerr:
6446 if (feof(fp)) {
6447 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6448 } else {
6449 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6450 }
6451 exit(1);
6452fmterr:
6453 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6454 exit(1);
6455}
6456
9d65a1bb 6457/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6458static int fwriteBulk(FILE *fp, robj *obj) {
6459 char buf[128];
6460 obj = getDecodedObject(obj);
6461 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
6462 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 6463 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
6464 goto err;
9d65a1bb 6465 if (fwrite("\r\n",2,1,fp) == 0) goto err;
6466 decrRefCount(obj);
6467 return 1;
6468err:
6469 decrRefCount(obj);
6470 return 0;
6471}
6472
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 if a write fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
6483
/* Write the long 'l', formatted in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 if a write fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
6494
6495/* Write a sequence of commands able to fully rebuild the dataset into
6496 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
6497static int rewriteAppendOnlyFile(char *filename) {
6498 dictIterator *di = NULL;
6499 dictEntry *de;
6500 FILE *fp;
6501 char tmpfile[256];
6502 int j;
6503 time_t now = time(NULL);
6504
6505 /* Note that we have to use a different temp name here compared to the
6506 * one used by rewriteAppendOnlyFileBackground() function. */
6507 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
6508 fp = fopen(tmpfile,"w");
6509 if (!fp) {
6510 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
6511 return REDIS_ERR;
6512 }
6513 for (j = 0; j < server.dbnum; j++) {
6514 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
6515 redisDb *db = server.db+j;
6516 dict *d = db->dict;
6517 if (dictSize(d) == 0) continue;
6518 di = dictGetIterator(d);
6519 if (!di) {
6520 fclose(fp);
6521 return REDIS_ERR;
6522 }
6523
6524 /* SELECT the new DB */
6525 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 6526 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 6527
6528 /* Iterate this DB writing every entry */
6529 while((de = dictNext(di)) != NULL) {
6530 robj *key = dictGetEntryKey(de);
6531 robj *o = dictGetEntryVal(de);
6532 time_t expiretime = getExpire(db,key);
6533
6534 /* Save the key and associated value */
9d65a1bb 6535 if (o->type == REDIS_STRING) {
6536 /* Emit a SET command */
6537 char cmd[]="*3\r\n$3\r\nSET\r\n";
6538 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6539 /* Key and value */
6540 if (fwriteBulk(fp,key) == 0) goto werr;
6541 if (fwriteBulk(fp,o) == 0) goto werr;
6542 } else if (o->type == REDIS_LIST) {
6543 /* Emit the RPUSHes needed to rebuild the list */
6544 list *list = o->ptr;
6545 listNode *ln;
6546
6547 listRewind(list);
6548 while((ln = listYield(list))) {
6549 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
6550 robj *eleobj = listNodeValue(ln);
6551
6552 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6553 if (fwriteBulk(fp,key) == 0) goto werr;
6554 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6555 }
6556 } else if (o->type == REDIS_SET) {
6557 /* Emit the SADDs needed to rebuild the set */
6558 dict *set = o->ptr;
6559 dictIterator *di = dictGetIterator(set);
6560 dictEntry *de;
6561
6562 while((de = dictNext(di)) != NULL) {
6563 char cmd[]="*3\r\n$4\r\nSADD\r\n";
6564 robj *eleobj = dictGetEntryKey(de);
6565
6566 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6567 if (fwriteBulk(fp,key) == 0) goto werr;
6568 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6569 }
6570 dictReleaseIterator(di);
6571 } else if (o->type == REDIS_ZSET) {
6572 /* Emit the ZADDs needed to rebuild the sorted set */
6573 zset *zs = o->ptr;
6574 dictIterator *di = dictGetIterator(zs->dict);
6575 dictEntry *de;
6576
6577 while((de = dictNext(di)) != NULL) {
6578 char cmd[]="*4\r\n$4\r\nZADD\r\n";
6579 robj *eleobj = dictGetEntryKey(de);
6580 double *score = dictGetEntryVal(de);
6581
6582 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6583 if (fwriteBulk(fp,key) == 0) goto werr;
6584 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
6585 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6586 }
6587 dictReleaseIterator(di);
6588 } else {
dfc5e96c 6589 redisAssert(0 != 0);
9d65a1bb 6590 }
6591 /* Save the expire time */
6592 if (expiretime != -1) {
e96e4fbf 6593 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 6594 /* If this key is already expired skip it */
6595 if (expiretime < now) continue;
6596 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6597 if (fwriteBulk(fp,key) == 0) goto werr;
6598 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
6599 }
6600 }
6601 dictReleaseIterator(di);
6602 }
6603
6604 /* Make sure data will not remain on the OS's output buffers */
6605 fflush(fp);
6606 fsync(fileno(fp));
6607 fclose(fp);
6608
6609 /* Use RENAME to make sure the DB file is changed atomically only
6610 * if the generate DB file is ok. */
6611 if (rename(tmpfile,filename) == -1) {
6612 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
6613 unlink(tmpfile);
6614 return REDIS_ERR;
6615 }
6616 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
6617 return REDIS_OK;
6618
6619werr:
6620 fclose(fp);
6621 unlink(tmpfile);
e96e4fbf 6622 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 6623 if (di) dictReleaseIterator(di);
6624 return REDIS_ERR;
6625}
6626
6627/* This is how rewriting of the append only file in background works:
6628 *
6629 * 1) The user calls BGREWRITEAOF
6630 * 2) Redis calls this function, that forks():
6631 * 2a) the child rewrite the append only file in a temp file.
6632 * 2b) the parent accumulates differences in server.bgrewritebuf.
6633 * 3) When the child finished '2a' exists.
6634 * 4) The parent will trap the exit code, if it's OK, will append the
6635 * data accumulated into server.bgrewritebuf into the temp file, and
6636 * finally will rename(2) the temp file in the actual file name.
6637 * The the new file is reopened as the new append only file. Profit!
6638 */
6639static int rewriteAppendOnlyFileBackground(void) {
6640 pid_t childpid;
6641
6642 if (server.bgrewritechildpid != -1) return REDIS_ERR;
6643 if ((childpid = fork()) == 0) {
6644 /* Child */
6645 char tmpfile[256];
6646 close(server.fd);
6647
6648 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
6649 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
6650 exit(0);
6651 } else {
6652 exit(1);
6653 }
6654 } else {
6655 /* Parent */
6656 if (childpid == -1) {
6657 redisLog(REDIS_WARNING,
6658 "Can't rewrite append only file in background: fork: %s",
6659 strerror(errno));
6660 return REDIS_ERR;
6661 }
6662 redisLog(REDIS_NOTICE,
6663 "Background append only file rewriting started by pid %d",childpid);
6664 server.bgrewritechildpid = childpid;
85a83172 6665 /* We set appendseldb to -1 in order to force the next call to the
6666 * feedAppendOnlyFile() to issue a SELECT command, so the differences
6667 * accumulated by the parent into server.bgrewritebuf will start
6668 * with a SELECT statement and it will be safe to merge. */
6669 server.appendseldb = -1;
9d65a1bb 6670 return REDIS_OK;
6671 }
6672 return REDIS_OK; /* unreached */
6673}
6674
6675static void bgrewriteaofCommand(redisClient *c) {
6676 if (server.bgrewritechildpid != -1) {
6677 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
6678 return;
6679 }
6680 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 6681 char *status = "+Background append only file rewriting started\r\n";
6682 addReplySds(c,sdsnew(status));
9d65a1bb 6683 } else {
6684 addReply(c,shared.err);
6685 }
6686}
6687
/* Remove the temp file that the background rewrite child with pid
 * 'childpid' writes to. The result of unlink() is ignored, so a missing
 * file is not an error. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
6694
75680a3c 6695/* =============================== Virtual Memory =========================== */
6696static void vmInit(void) {
6697 off_t totsize;
6698
6699 server.vm_fp = fopen("/tmp/redisvm","w+b");
6700 if (server.vm_fp == NULL) {
6701 redisLog(REDIS_WARNING,"Impossible to open the swap file. Exiting.");
6702 exit(1);
6703 }
6704 server.vm_fd = fileno(server.vm_fp);
6705 server.vm_next_page = 0;
6706 server.vm_near_pages = 0;
6707 totsize = server.vm_pages*server.vm_page_size;
6708 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
6709 if (ftruncate(server.vm_fd,totsize) == -1) {
6710 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
6711 strerror(errno));
6712 exit(1);
6713 } else {
6714 redisLog(REDIS_NOTICE,"Swap file allocated with success");
6715 }
7d30035d 6716 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
4ef8de8a 6717 redisLog(REDIS_DEBUG,"Allocated %lld bytes page table for %lld pages",
6718 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 6719 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
75680a3c 6720 /* Try to remove the swap file, so the OS will really delete it from the
6721 * file system when Redis exists. */
6722 unlink("/tmp/redisvm");
6723}
6724
06224fec 6725/* Mark the page as used */
6726static void vmMarkPageUsed(off_t page) {
6727 off_t byte = page/8;
6728 int bit = page&7;
6729 server.vm_bitmap[byte] |= 1<<bit;
4ef8de8a 6730 printf("Mark used: %lld (byte:%lld bit:%d)\n", (long long)page,
6731 (long long)byte, bit);
06224fec 6732}
6733
/* Flag as used the 'count' contiguous pages starting at 'page'. */
static void vmMarkPagesUsed(off_t page, off_t count) {
    off_t done = 0;

    while (done < count) {
        vmMarkPageUsed(page+done);
        done++;
    }
}
6741
6742/* Mark the page as free */
6743static void vmMarkPageFree(off_t page) {
6744 off_t byte = page/8;
6745 int bit = page&7;
6746 server.vm_bitmap[byte] &= ~(1<<bit);
6747}
6748
/* Flag as free the 'count' contiguous pages starting at 'page'. */
static void vmMarkPagesFree(off_t page, off_t count) {
    off_t done = 0;

    while (done < count) {
        vmMarkPageFree(page+done);
        done++;
    }
}
6756
6757/* Test if the page is free */
6758static int vmFreePage(off_t page) {
6759 off_t byte = page/8;
6760 int bit = page&7;
7d30035d 6761 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 6762}
6763
6764/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 6765 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
6766 * REDIS_ERR is returned.
06224fec 6767 *
6768 * This function uses a simple algorithm: we try to allocate
6769 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
6770 * again from the start of the swap file searching for free spaces.
6771 *
6772 * If it looks pretty clear that there are no free pages near our offset
6773 * we try to find less populated places doing a forward jump of
6774 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
6775 * without hurry, and then we jump again and so forth...
6776 *
6777 * This function can be improved using a free list to avoid to guess
6778 * too much, since we could collect data about freed pages.
6779 *
6780 * note: I implemented this function just after watching an episode of
6781 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
6782 */
6783static int vmFindContiguousPages(off_t *first, int n) {
6784 off_t base, offset = 0, since_jump = 0, numfree = 0;
6785
6786 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
6787 server.vm_near_pages = 0;
6788 server.vm_next_page = 0;
6789 }
6790 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
6791 base = server.vm_next_page;
6792
6793 while(offset < server.vm_pages) {
6794 off_t this = base+offset;
6795
7d30035d 6796 printf("THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
06224fec 6797 /* If we overflow, restart from page zero */
6798 if (this >= server.vm_pages) {
6799 this -= server.vm_pages;
6800 if (this == 0) {
6801 /* Just overflowed, what we found on tail is no longer
6802 * interesting, as it's no longer contiguous. */
6803 numfree = 0;
6804 }
6805 }
6806 if (vmFreePage(this)) {
6807 /* This is a free page */
6808 numfree++;
6809 /* Already got N free pages? Return to the caller, with success */
6810 if (numfree == n) {
7d30035d 6811 *first = this-(n-1);
6812 server.vm_next_page = this+1;
3a66edc7 6813 return REDIS_OK;
06224fec 6814 }
6815 } else {
6816 /* The current one is not a free page */
6817 numfree = 0;
6818 }
6819
6820 /* Fast-forward if the current page is not free and we already
6821 * searched enough near this place. */
6822 since_jump++;
6823 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
6824 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
6825 since_jump = 0;
6826 /* Note that even if we rewind after the jump, we are don't need
6827 * to make sure numfree is set to zero as we only jump *if* it
6828 * is set to zero. */
6829 } else {
6830 /* Otherwise just check the next page */
6831 offset++;
6832 }
6833 }
3a66edc7 6834 return REDIS_ERR;
6835}
6836
6837/* Swap the 'val' object relative to 'key' into disk. Store all the information
6838 * needed to later retrieve the object into the key object.
6839 * If we can't find enough contiguous empty pages to swap the object on disk
6840 * REDIS_ERR is returned. */
6841static int vmSwapObject(robj *key, robj *val) {
6842 off_t pages = rdbSavedObjectPages(val);
6843 off_t page;
6844
6845 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 6846 assert(key->refcount == 1);
3a66edc7 6847 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
6848 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
6849 redisLog(REDIS_WARNING,
6850 "Critical VM problem in vmSwapObject(): can't seek: %s",
6851 strerror(errno));
6852 return REDIS_ERR;
6853 }
6854 rdbSaveObject(server.vm_fp,val);
6855 key->vm.page = page;
6856 key->vm.usedpages = pages;
6857 key->storage = REDIS_VM_SWAPPED;
d894161b 6858 key->vtype = val->type;
3a66edc7 6859 decrRefCount(val); /* Deallocate the object from memory. */
6860 vmMarkPagesUsed(page,pages);
7d30035d 6861 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
6862 (unsigned char*) key->ptr,
6863 (unsigned long long) page, (unsigned long long) pages);
3a66edc7 6864 return REDIS_OK;
6865}
6866
6867/* Load the value object relative to the 'key' object from swap to memory.
7e69548d 6868 * The newly allocated object is returned.
6869 *
6870 * If preview is true the unserialized object is returned to the caller but
6871 * no changes are made to the key object, nor the pages are marked as freed */
6872static robj *vmGenericLoadObject(robj *key, int preview) {
3a66edc7 6873 robj *val;
6874
6875 assert(key->storage == REDIS_VM_SWAPPED);
6876 if (fseeko(server.vm_fp,key->vm.page*server.vm_page_size,SEEK_SET) == -1) {
6877 redisLog(REDIS_WARNING,
6878 "Unrecoverable VM problem in vmLoadObject(): can't seek: %s",
6879 strerror(errno));
6880 exit(1);
6881 }
d894161b 6882 val = rdbLoadObject(key->vtype,server.vm_fp);
3a66edc7 6883 if (val == NULL) {
6884 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmLoadObject(): can't load object from swap file: %s", strerror(errno));
6885 exit(1);
6886 }
7e69548d 6887 if (!preview) {
6888 key->storage = REDIS_VM_MEMORY;
6889 key->vm.atime = server.unixtime;
6890 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
6891 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
6892 (unsigned char*) key->ptr);
6893 }
3a66edc7 6894 return val;
06224fec 6895}
6896
7e69548d 6897/* Plain object loading, from swap to memory */
6898static robj *vmLoadObject(robj *key) {
6899 return vmGenericLoadObject(key,0);
6900}
6901
6902/* Just load the value on disk, without to modify the key.
6903 * This is useful when we want to perform some operation on the value
6904 * without to really bring it from swap to memory, like while saving the
6905 * dataset or rewriting the append only log. */
6906static robj *vmPreviewObject(robj *key) {
6907 return vmGenericLoadObject(key,1);
6908}
6909
4ef8de8a 6910/* How a good candidate is this object for swapping?
6911 * The better candidate it is, the greater the returned value.
6912 *
6913 * Currently we try to perform a fast estimation of the object size in
6914 * memory, and combine it with aging informations.
6915 *
6916 * Basically swappability = idle-time * log(estimated size)
6917 *
6918 * Bigger objects are preferred over smaller objects, but not
6919 * proportionally, this is why we use the logarithm. This algorithm is
6920 * just a first try and will probably be tuned later. */
6921static double computeObjectSwappability(robj *o) {
6922 time_t age = server.unixtime - o->vm.atime;
6923 long asize = 0;
6924 list *l;
6925 dict *d;
6926 struct dictEntry *de;
6927 int z;
6928
6929 if (age <= 0) return 0;
6930 switch(o->type) {
6931 case REDIS_STRING:
6932 if (o->encoding != REDIS_ENCODING_RAW) {
6933 asize = sizeof(*o);
6934 } else {
6935 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
6936 }
6937 break;
6938 case REDIS_LIST:
6939 l = o->ptr;
6940 listNode *ln = listFirst(l);
6941
6942 asize = sizeof(list);
6943 if (ln) {
6944 robj *ele = ln->value;
6945 long elesize;
6946
6947 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
6948 (sizeof(*o)+sdslen(ele->ptr)) :
6949 sizeof(*o);
6950 asize += (sizeof(listNode)+elesize)*listLength(l);
6951 }
6952 break;
6953 case REDIS_SET:
6954 case REDIS_ZSET:
6955 z = (o->type == REDIS_ZSET);
6956 d = z ? ((zset*)o->ptr)->dict : o->ptr;
6957
6958 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
6959 if (z) asize += sizeof(zset)-sizeof(dict);
6960 if (dictSize(d)) {
6961 long elesize;
6962 robj *ele;
6963
6964 de = dictGetRandomKey(d);
6965 ele = dictGetEntryKey(de);
6966 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
6967 (sizeof(*o)+sdslen(ele->ptr)) :
6968 sizeof(*o);
6969 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
6970 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
6971 }
6972 break;
6973 }
6974 return (double)asize*log(1+asize);
6975}
6976
6977/* Try to swap an object that's a good candidate for swapping.
6978 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
6979 * to swap any object at all. */
6980static int vmSwapOneObject(void) {
6981 int j, i;
6982 struct dictEntry *best = NULL;
6983 double best_swappability = 0;
6984 robj *key, *val;
6985
6986 for (j = 0; j < server.dbnum; j++) {
6987 redisDb *db = server.db+j;
e3cadb8a 6988 int maxtries = 1000;
4ef8de8a 6989
6990 if (dictSize(db->dict) == 0) continue;
6991 for (i = 0; i < 5; i++) {
6992 dictEntry *de;
6993 double swappability;
6994
e3cadb8a 6995 if (maxtries) maxtries--;
4ef8de8a 6996 de = dictGetRandomKey(db->dict);
6997 key = dictGetEntryKey(de);
6998 val = dictGetEntryVal(de);
e3cadb8a 6999 if (key->storage != REDIS_VM_MEMORY) {
7000 if (maxtries) i--; /* don't count this try */
7001 continue;
7002 }
4ef8de8a 7003 swappability = computeObjectSwappability(val);
7004 if (!best || swappability > best_swappability) {
7005 best = de;
7006 best_swappability = swappability;
7007 }
7008 }
7009 }
e3cadb8a 7010 if (best == NULL) {
7011 redisLog(REDIS_DEBUG,"No swappable key found!");
7012 return REDIS_ERR;
7013 }
4ef8de8a 7014 key = dictGetEntryKey(best);
7015 val = dictGetEntryVal(best);
7016
e3cadb8a 7017 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 7018 key->ptr, best_swappability);
7019
7020 /* Unshare the key if needed */
7021 if (key->refcount > 1) {
7022 robj *newkey = dupStringObject(key);
7023 decrRefCount(key);
7024 key = dictGetEntryKey(best) = newkey;
7025 }
7026 /* Swap it */
7027 if (vmSwapObject(key,val) == REDIS_OK) {
7028 dictGetEntryVal(best) = NULL;
7029 return REDIS_OK;
7030 } else {
7031 return REDIS_ERR;
7032 }
7033}
7034
7e69548d 7035/* Return true if it's safe to swap out objects in a given moment.
7036 * Basically we don't want to swap objects out while there is a BGSAVE
7037 * or a BGAEOREWRITE running in backgroud. */
7038static int vmCanSwapOut(void) {
7039 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7040}
7041
1b03836c 7042/* Delete a key if swapped. Returns 1 if the key was found, was swapped
7043 * and was deleted. Otherwise 0 is returned. */
7044static int deleteIfSwapped(redisDb *db, robj *key) {
7045 dictEntry *de;
7046 robj *foundkey;
7047
7048 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7049 foundkey = dictGetEntryKey(de);
7050 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7051 deleteKey(db,key);
7052 return 1;
7053}
7054
7f957c92 7055/* ================================= Debugging ============================== */
7056
7057static void debugCommand(redisClient *c) {
7058 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
7059 *((char*)-1) = 'x';
210e29f7 7060 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
7061 if (rdbSave(server.dbfilename) != REDIS_OK) {
7062 addReply(c,shared.err);
7063 return;
7064 }
7065 emptyDb();
7066 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7067 addReply(c,shared.err);
7068 return;
7069 }
7070 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
7071 addReply(c,shared.ok);
71c2b467 7072 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
7073 emptyDb();
7074 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
7075 addReply(c,shared.err);
7076 return;
7077 }
7078 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
7079 addReply(c,shared.ok);
333298da 7080 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
7081 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7082 robj *key, *val;
7083
7084 if (!de) {
7085 addReply(c,shared.nokeyerr);
7086 return;
7087 }
7088 key = dictGetEntryKey(de);
7089 val = dictGetEntryVal(de);
7090 addReplySds(c,sdscatprintf(sdsempty(),
06233c45 7091 "+Key at:%p refcount:%d, value at:%p refcount:%d encoding:%d serializedlength:%lld\r\n",
682ac724 7092 (void*)key, key->refcount, (void*)val, val->refcount,
06233c45 7093 val->encoding, rdbSavedObjectLen(val)));
7d30035d 7094 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
7095 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7096 robj *key, *val;
7097
7098 if (!server.vm_enabled) {
7099 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
7100 return;
7101 }
7102 if (!de) {
7103 addReply(c,shared.nokeyerr);
7104 return;
7105 }
7106 key = dictGetEntryKey(de);
7107 val = dictGetEntryVal(de);
4ef8de8a 7108 /* If the key is shared we want to create a copy */
7109 if (key->refcount > 1) {
7110 robj *newkey = dupStringObject(key);
7111 decrRefCount(key);
7112 key = dictGetEntryKey(de) = newkey;
7113 }
7114 /* Swap it */
7d30035d 7115 if (key->storage != REDIS_VM_MEMORY) {
7116 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
7117 } else if (vmSwapObject(key,val) == REDIS_OK) {
7118 dictGetEntryVal(de) = NULL;
7119 addReply(c,shared.ok);
7120 } else {
7121 addReply(c,shared.err);
7122 }
7f957c92 7123 } else {
333298da 7124 addReplySds(c,sdsnew(
7d30035d 7125 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 7126 }
7127}
56906eef 7128
dfc5e96c 7129static void _redisAssert(char *estr) {
7130 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
7131 redisLog(REDIS_WARNING,"==> %s\n",estr);
7132#ifdef HAVE_BACKTRACE
7133 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
7134 *((char*)-1) = 'x';
7135#endif
7136}
7137
bcfc686d 7138/* =================================== Main! ================================ */
56906eef 7139
bcfc686d 7140#ifdef __linux__
/* Return the integer found in the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,64,fp);
    fclose(fp);
    if (got == NULL) return -1;
    return atoi(line);
}
7154
7155void linuxOvercommitMemoryWarning(void) {
7156 if (linuxOvercommitMemoryValue() == 0) {
7157 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
7158 }
7159}
7160#endif /* __linux__ */
7161
7162static void daemonize(void) {
7163 int fd;
7164 FILE *fp;
7165
7166 if (fork() != 0) exit(0); /* parent exits */
71c54b21 7167 printf("New pid: %d\n", getpid());
bcfc686d 7168 setsid(); /* create a new session */
7169
7170 /* Every output goes to /dev/null. If Redis is daemonized but
7171 * the 'logfile' is set to 'stdout' in the configuration file
7172 * it will not log at all. */
7173 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
7174 dup2(fd, STDIN_FILENO);
7175 dup2(fd, STDOUT_FILENO);
7176 dup2(fd, STDERR_FILENO);
7177 if (fd > STDERR_FILENO) close(fd);
7178 }
7179 /* Try to write the pid file */
7180 fp = fopen(server.pidfile,"w");
7181 if (fp) {
7182 fprintf(fp,"%d\n",getpid());
7183 fclose(fp);
56906eef 7184 }
56906eef 7185}
7186
bcfc686d 7187int main(int argc, char **argv) {
7188 initServerConfig();
7189 if (argc == 2) {
7190 resetServerSaveParams();
7191 loadServerConfig(argv[1]);
7192 } else if (argc > 2) {
7193 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
7194 exit(1);
7195 } else {
7196 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
7197 }
bcfc686d 7198 if (server.daemonize) daemonize();
71c54b21 7199 initServer();
bcfc686d 7200 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
7201#ifdef __linux__
7202 linuxOvercommitMemoryWarning();
7203#endif
7204 if (server.appendonly) {
7205 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
7206 redisLog(REDIS_NOTICE,"DB loaded from append only file");
7207 } else {
7208 if (rdbLoad(server.dbfilename) == REDIS_OK)
7209 redisLog(REDIS_NOTICE,"DB loaded from disk");
7210 }
7211 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
266373b2 7212 acceptHandler, NULL) == AE_ERR) oom("creating file event");
bcfc686d 7213 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
7214 aeMain(server.el);
7215 aeDeleteEventLoop(server.el);
7216 return 0;
7217}
7218
7219/* ============================= Backtrace support ========================= */
7220
7221#ifdef HAVE_BACKTRACE
7222static char *findFuncName(void *pointer, unsigned long *offset);
7223
56906eef 7224static void *getMcontextEip(ucontext_t *uc) {
7225#if defined(__FreeBSD__)
7226 return (void*) uc->uc_mcontext.mc_eip;
7227#elif defined(__dietlibc__)
7228 return (void*) uc->uc_mcontext.eip;
06db1f50 7229#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 7230 #if __x86_64__
7231 return (void*) uc->uc_mcontext->__ss.__rip;
7232 #else
56906eef 7233 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 7234 #endif
06db1f50 7235#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 7236 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 7237 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 7238 #else
7239 return (void*) uc->uc_mcontext->__ss.__eip;
7240 #endif
c04c9ac9 7241#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
7242 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 7243#elif defined(__ia64__) /* Linux IA64 */
7244 return (void*) uc->uc_mcontext.sc_ip;
7245#else
7246 return NULL;
56906eef 7247#endif
7248}
7249
7250static void segvHandler(int sig, siginfo_t *info, void *secret) {
7251 void *trace[100];
7252 char **messages = NULL;
7253 int i, trace_size = 0;
7254 unsigned long offset=0;
56906eef 7255 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 7256 sds infostring;
56906eef 7257 REDIS_NOTUSED(info);
7258
7259 redisLog(REDIS_WARNING,
7260 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 7261 infostring = genRedisInfoString();
7262 redisLog(REDIS_WARNING, "%s",infostring);
7263 /* It's not safe to sdsfree() the returned string under memory
7264 * corruption conditions. Let it leak as we are going to abort */
56906eef 7265
7266 trace_size = backtrace(trace, 100);
de96dbfe 7267 /* overwrite sigaction with caller's address */
b91cf5ef 7268 if (getMcontextEip(uc) != NULL) {
7269 trace[1] = getMcontextEip(uc);
7270 }
56906eef 7271 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 7272
d76412d1 7273 for (i=1; i<trace_size; ++i) {
56906eef 7274 char *fn = findFuncName(trace[i], &offset), *p;
7275
7276 p = strchr(messages[i],'+');
7277 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
7278 redisLog(REDIS_WARNING,"%s", messages[i]);
7279 } else {
7280 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
7281 }
7282 }
b177fd30 7283 /* free(messages); Don't call free() with possibly corrupted memory. */
56906eef 7284 exit(0);
fe3bbfbe 7285}
56906eef 7286
7287static void setupSigSegvAction(void) {
7288 struct sigaction act;
7289
7290 sigemptyset (&act.sa_mask);
7291 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
7292 * is used. Otherwise, sa_handler is used */
7293 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
7294 act.sa_sigaction = segvHandler;
7295 sigaction (SIGSEGV, &act, NULL);
7296 sigaction (SIGBUS, &act, NULL);
12fea928 7297 sigaction (SIGFPE, &act, NULL);
7298 sigaction (SIGILL, &act, NULL);
7299 sigaction (SIGBUS, &act, NULL);
e65fdc78 7300 return;
56906eef 7301}
e65fdc78 7302
bcfc686d 7303#include "staticsymbols.h"
7304/* This function try to convert a pointer into a function name. It's used in
7305 * oreder to provide a backtrace under segmentation fault that's able to
7306 * display functions declared as static (otherwise the backtrace is useless). */
7307static char *findFuncName(void *pointer, unsigned long *offset){
7308 int i, ret = -1;
7309 unsigned long off, minoff = 0;
ed9b544e 7310
bcfc686d 7311 /* Try to match against the Symbol with the smallest offset */
7312 for (i=0; symsTable[i].pointer; i++) {
7313 unsigned long lp = (unsigned long) pointer;
0bc03378 7314
bcfc686d 7315 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
7316 off=lp-symsTable[i].pointer;
7317 if (ret < 0 || off < minoff) {
7318 minoff=off;
7319 ret=i;
7320 }
7321 }
0bc03378 7322 }
bcfc686d 7323 if (ret == -1) return NULL;
7324 *offset = minoff;
7325 return symsTable[ret].name;
0bc03378 7326}
bcfc686d 7327#else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* No backtrace support compiled in: no crash handler is installed. */
}
bcfc686d 7330#endif /* HAVE_BACKTRACE */
0bc03378 7331
ed9b544e 7332
ed9b544e 7333
bcfc686d 7334/* The End */
7335
7336
ed9b544e 7337