]> git.saurik.com Git - redis.git/blob - redis.c
04c2c80b2711eafd55cb68fe50e460068017f8a5
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "1.3.7"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Status codes shared by the internal functions */
#define REDIS_OK                        0
#define REDIS_ERR                       -1

/* Static server configuration */
#define REDIS_SERVERPORT                6379            /* TCP port */
#define REDIS_MAXIDLETIME               (5*60)          /* default client timeout */
#define REDIS_IOBUF_LEN                 1024
#define REDIS_LOADBUF_LEN               1024
#define REDIS_STATIC_ARGS               4
#define REDIS_DEFAULT_DBNUM             16
#define REDIS_CONFIGLINE_MAX            1024
#define REDIS_OBJFREELIST_MAX           1000000         /* max number of objects to cache */
#define REDIS_MAX_SYNC_TIME             60              /* slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10              /* try to expire 10 keys/loop */
#define REDIS_MAX_WRITE_PER_EVENT       (64*1024)
#define REDIS_REQUEST_MAX_SIZE          (256*1024*1024) /* max bytes in inline command */

/* writev is used when more than REDIS_WRITEV_THRESHOLD write packets are
 * pending */
#define REDIS_WRITEV_THRESHOLD          3
/* Upper bound on the number of iovecs passed to each writev call */
#define REDIS_WRITEV_IOVEC_COUNT        256

/* Hash table parameters */
#define REDIS_HT_MINFILL                10              /* minimal hash table fill 10% */
105
/* Command flags: independent bits that can be OR-ed together in the
 * 'flags' field of a command table entry. */
#define REDIS_CMD_BULK      (1<<0)  /* Bulk write command */
#define REDIS_CMD_INLINE    (1<<1)  /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command carrying this
 * flag returns an error when the 'maxmemory' option is set in the config
 * file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM   (1<<2)

/* Object types */
#define REDIS_STRING        0
#define REDIS_LIST          1
#define REDIS_SET           2
#define REDIS_ZSET          3
#define REDIS_HASH          4
121
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * represented internally in more than one way. The 'encoding' field of an
 * object holds one of the following values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable name of every encoding, indexed by its REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
133
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255

/* Defines related to the dump file format. Storing a 32 bit length for
 * every short key would waste a lot of space, so the two most significant
 * bits of the first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the remaining 6 bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => a full 32 bit len will follow
 * 11|000000 => a specially encoded object will follow. The six bits
 *              number specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object,
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8      0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16     1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32     2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF       3   /* string compressed with FASTLZ */
165
/* Values for the virtual memory 'where' field of an object. */
#define REDIS_VM_MEMORY             0   /* The object is on memory */
#define REDIS_VM_SWAPPED            1   /* The object is on disk */
#define REDIS_VM_SWAPPING           2   /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING            3   /* Redis is loading this object from disk */

/* Virtual memory static configuration.
 * See vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (4*1024*1024)
/* The *percentage* of completed I/O jobs to process each time the handler
 * is called. Virtual Memory I/O operations are performed by threads, but
 * once completed they must be processed by the main thread in order to
 * take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
/* Client flags: independent bits stored in the client 'flags' field. */
#define REDIS_SLAVE     (1<<0)  /* This client is a slave server */
#define REDIS_MASTER    (1<<1)  /* This client is a master server */
#define REDIS_MONITOR   (1<<2)  /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     (1<<3)  /* This client is in a MULTI context */
#define REDIS_BLOCKED   (1<<4)  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   (1<<5)  /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE                 0   /* No active replication */
#define REDIS_REPL_CONNECT              1   /* Must connect to master */
#define REDIS_REPL_CONNECTED            2   /* Connected to master */

/* Slave replication state - as seen by the master.
 * In the SEND_BULK and ONLINE states the slave receives new updates in its
 * output queue. In the WAIT_BGSAVE states, instead, the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
205
/* List ends */
#define REDIS_HEAD                      0
#define REDIS_TAIL                      1

/* Sort operations */
#define REDIS_SORT_GET                  0
#define REDIS_SORT_ASC                  1
#define REDIS_SORT_DESC                 2
#define REDIS_SORTKEY_MAX               1024

/* Log levels */
#define REDIS_DEBUG                     0
#define REDIS_VERBOSE                   1
#define REDIS_NOTICE                    2
#define REDIS_WARNING                   3

/* Silences "unused" compiler warnings for V */
#define REDIS_NOTUSED(V) ((void) V)

/* Skiplist tuning */
#define ZSKIPLIST_MAXLEVEL              32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P                     0.25    /* Skiplist P = 1/4 */

/* Append only file fsync policies */
#define APPENDFSYNC_NO                  0
#define APPENDFSYNC_ALWAYS              1
#define APPENDFSYNC_EVERYSEC            2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
236
/* Project-specific assert. When the expression is false the stringified
 * expression, the file name and the line number are handed to
 * _redisAssert() (so a stack trace can be printed), then the process is
 * terminated with _exit(1). */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
/* The VM object structure: per-object virtual memory bookkeeping, embedded
 * in struct redisObject as its 'vm' member.
 *
 * Only the type is declared here. The declaration used to end with a
 * trailing "vm" declarator, which besides the type also defined a stray
 * global object named 'vm' with external linkage. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr;
255 unsigned char type;
256 unsigned char encoding;
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount;
262 /* VM fields, this are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Macro used to initialize a Redis object allocated on the stack as a raw
 * string pointing to _ptr, with a reference count of 1. The 'storage'
 * field is only written when virtual memory is enabled.
 *
 * _var is the object itself (an lvalue, not a pointer to it).
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be used
 * as the unbraced body of an if/else. Arguments are parenthesized so that
 * expressions such as *p can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
281 typedef struct redisDb {
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id;
287 } redisDb;
288
289 /* Client MULTI/EXEC state */
290 typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294 } multiCmd;
295
296 typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to take per-clinet state.
302 * Clients are taken in a liked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The key we are waiting to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 } redisClient;
331
/* A pair of thresholds: a number of seconds and a number of changes.
 * The server state keeps an array of these in 'saveparams'. */
struct saveparam {
    time_t seconds;     /* seconds threshold */
    int changes;        /* changes threshold */
};
336
337 /* Global server state structure */
338 struct redisServer {
339 int port;
340 int fd;
341 redisDb *db;
342 dict *sharingpool; /* Poll used for object sharing */
343 unsigned int sharingpoolsize;
344 long long dirty; /* changes to DB from the last save */
345 list *clients;
346 list *slaves, *monitors;
347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of last save succeeede */
352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 long long stat_expiredkeys; /* number of expired keys */
357 /* Configuration */
358 int verbosity;
359 int glueoutputbuf;
360 int maxidletime;
361 int dbnum;
362 int daemonize;
363 int appendonly;
364 int appendfsync;
365 time_t lastfsync;
366 int appendfd;
367 int appendseldb;
368 char *pidfile;
369 pid_t bgsavechildpid;
370 pid_t bgrewritechildpid;
371 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
372 struct saveparam *saveparams;
373 int saveparamslen;
374 char *logfile;
375 char *bindaddr;
376 char *dbfilename;
377 char *appendfilename;
378 char *requirepass;
379 int shareobjects;
380 int rdbcompression;
381 /* Replication related */
382 int isslave;
383 char *masterauth;
384 char *masterhost;
385 int masterport;
386 redisClient *master; /* client that is master for this slave */
387 int replstate;
388 unsigned int maxclients;
389 unsigned long long maxmemory;
390 unsigned int blpop_blocked_clients;
391 unsigned int vm_blocked_clients;
392 /* Sort parameters - qsort_r() is only available under BSD so we
393 * have to take this state global, in order to pass it to sortCompare() */
394 int sort_desc;
395 int sort_alpha;
396 int sort_bypattern;
397 /* Virtual memory configuration */
398 int vm_enabled;
399 char *vm_swap_file;
400 off_t vm_page_size;
401 off_t vm_pages;
402 unsigned long long vm_max_memory;
403 /* Hashes config */
404 size_t hash_max_zipmap_entries;
405 size_t hash_max_zipmap_value;
406 /* Virtual memory state */
407 FILE *vm_fp;
408 int vm_fd;
409 off_t vm_next_page; /* Next probably empty page */
410 off_t vm_near_pages; /* Number of pages allocated sequentially */
411 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
412 time_t unixtime; /* Unix time sampled every second. */
413 /* Virtual memory I/O threads stuff */
414 /* An I/O thread process an element taken from the io_jobs queue and
415 * put the result of the operation in the io_done list. While the
416 * job is being processed, it's put on io_processing queue. */
417 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
418 list *io_processing; /* List of VM I/O jobs being processed */
419 list *io_processed; /* List of VM I/O jobs already processed */
420 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
421 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
422 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
423 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
424 pthread_attr_t io_threads_attr; /* attributes for threads creation */
425 int io_active_threads; /* Number of running I/O threads */
426 int vm_max_threads; /* Max number of I/O threads running at the same time */
427 /* Our main thread is blocked on the event loop, locking for sockets ready
428 * to be read or written, so when a threaded I/O operation is ready to be
429 * processed by the main thread, the I/O thread will use a unix pipe to
430 * awake the main thread. The followings are the two pipe FDs. */
431 int io_ready_pipe_read;
432 int io_ready_pipe_write;
433 /* Virtual memory stats */
434 unsigned long long vm_stats_used_pages;
435 unsigned long long vm_stats_swapped_objects;
436 unsigned long long vm_stats_swapouts;
437 unsigned long long vm_stats_swapins;
438 FILE *devnull;
439 };
440
441 typedef void redisCommandProc(redisClient *c);
442 struct redisCommand {
443 char *name;
444 redisCommandProc *proc;
445 int arity;
446 int flags;
447 /* Use a function to determine which keys need to be loaded
448 * in the background prior to executing this command. Takes precedence
449 * over vm_firstkey and others, ignored when NULL */
450 redisCommandProc *vm_preload_proc;
451 /* What keys should be loaded in background when calling this command? */
452 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
453 int vm_lastkey; /* THe last argument that's a key */
454 int vm_keystep; /* The step between first and last key */
455 };
456
/* Pairs a name with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;             /* the name */
    unsigned long pointer;  /* the value associated with the name */
};
461
462 typedef struct _redisSortObject {
463 robj *obj;
464 union {
465 double score;
466 robj *cmpobj;
467 } u;
468 } redisSortObject;
469
470 typedef struct _redisSortOperation {
471 int type;
472 robj *pattern;
473 } redisSortOperation;
474
475 /* ZSETs use a specialized version of Skiplists */
476
477 typedef struct zskiplistNode {
478 struct zskiplistNode **forward;
479 struct zskiplistNode *backward;
480 unsigned int *span;
481 double score;
482 robj *obj;
483 } zskiplistNode;
484
485 typedef struct zskiplist {
486 struct zskiplistNode *header, *tail;
487 unsigned long length;
488 int level;
489 } zskiplist;
490
491 typedef struct zset {
492 dict *dict;
493 zskiplist *zsl;
494 } zset;
495
496 /* Our shared "common" objects */
497
498 struct sharedObjectsStruct {
499 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
500 *colon, *nullbulk, *nullmultibulk, *queued,
501 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
502 *outofrangeerr, *plus,
503 *select0, *select1, *select2, *select3, *select4,
504 *select5, *select6, *select7, *select8, *select9;
505 } shared;
506
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */
static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
512
513 /* VM threaded I/O request message */
514 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
515 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
516 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
517 typedef struct iojob {
518 int type; /* Request type, REDIS_IOJOB_* */
519 redisDb *db;/* Redis database */
520 robj *key; /* This I/O request is about swapping this key */
521 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
522 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
523 off_t page; /* Swap page where to read/write the object */
524 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
525 int canceled; /* True if this command was canceled by blocking side of VM */
526 pthread_t thread; /* ID of the thread processing this entry */
527 } iojob;
528
529 /*================================ Prototypes =============================== */
530
531 static void freeStringObject(robj *o);
532 static void freeListObject(robj *o);
533 static void freeSetObject(robj *o);
534 static void decrRefCount(void *o);
535 static robj *createObject(int type, void *ptr);
536 static void freeClient(redisClient *c);
537 static int rdbLoad(char *filename);
538 static void addReply(redisClient *c, robj *obj);
539 static void addReplySds(redisClient *c, sds s);
540 static void incrRefCount(robj *o);
541 static int rdbSaveBackground(char *filename);
542 static robj *createStringObject(char *ptr, size_t len);
543 static robj *dupStringObject(robj *o);
544 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
545 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
546 static int syncWithMaster(void);
547 static robj *tryObjectSharing(robj *o);
548 static int tryObjectEncoding(robj *o);
549 static robj *getDecodedObject(robj *o);
550 static int removeExpire(redisDb *db, robj *key);
551 static int expireIfNeeded(redisDb *db, robj *key);
552 static int deleteIfVolatile(redisDb *db, robj *key);
553 static int deleteIfSwapped(redisDb *db, robj *key);
554 static int deleteKey(redisDb *db, robj *key);
555 static time_t getExpire(redisDb *db, robj *key);
556 static int setExpire(redisDb *db, robj *key, time_t when);
557 static void updateSlavesWaitingBgsave(int bgsaveerr);
558 static void freeMemoryIfNeeded(void);
559 static int processCommand(redisClient *c);
560 static void setupSigSegvAction(void);
561 static void rdbRemoveTempFile(pid_t childpid);
562 static void aofRemoveTempFile(pid_t childpid);
563 static size_t stringObjectLen(robj *o);
564 static void processInputBuffer(redisClient *c);
565 static zskiplist *zslCreate(void);
566 static void zslFree(zskiplist *zsl);
567 static void zslInsert(zskiplist *zsl, double score, robj *obj);
568 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
569 static void initClientMultiState(redisClient *c);
570 static void freeClientMultiState(redisClient *c);
571 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
572 static void unblockClientWaitingData(redisClient *c);
573 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
574 static void vmInit(void);
575 static void vmMarkPagesFree(off_t page, off_t count);
576 static robj *vmLoadObject(robj *key);
577 static robj *vmPreviewObject(robj *key);
578 static int vmSwapOneObjectBlocking(void);
579 static int vmSwapOneObjectThreaded(void);
580 static int vmCanSwapOut(void);
581 static int tryFreeOneObjectFromFreelist(void);
582 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void vmCancelThreadedIOJob(robj *o);
585 static void lockThreadedIO(void);
586 static void unlockThreadedIO(void);
587 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
588 static void freeIOJob(iojob *j);
589 static void queueIOJob(iojob *j);
590 static int vmWriteObjectOnSwap(robj *o, off_t page);
591 static robj *vmReadObjectFromSwap(off_t page, int type);
592 static void waitEmptyIOJobsQueue(void);
593 static void vmReopenSwapFile(void);
594 static int vmFreePage(off_t page);
595 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
596 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
597 static int dontWaitForSwappedKey(redisClient *c, robj *key);
598 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
599 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
600 static struct redisCommand *lookupCommand(char *name);
601 static void call(redisClient *c, struct redisCommand *cmd);
602 static void resetClient(redisClient *c);
603 static void convertToRealHash(robj *o);
604
605 static void authCommand(redisClient *c);
606 static void pingCommand(redisClient *c);
607 static void echoCommand(redisClient *c);
608 static void setCommand(redisClient *c);
609 static void setnxCommand(redisClient *c);
610 static void getCommand(redisClient *c);
611 static void delCommand(redisClient *c);
612 static void existsCommand(redisClient *c);
613 static void incrCommand(redisClient *c);
614 static void decrCommand(redisClient *c);
615 static void incrbyCommand(redisClient *c);
616 static void decrbyCommand(redisClient *c);
617 static void selectCommand(redisClient *c);
618 static void randomkeyCommand(redisClient *c);
619 static void keysCommand(redisClient *c);
620 static void dbsizeCommand(redisClient *c);
621 static void lastsaveCommand(redisClient *c);
622 static void saveCommand(redisClient *c);
623 static void bgsaveCommand(redisClient *c);
624 static void bgrewriteaofCommand(redisClient *c);
625 static void shutdownCommand(redisClient *c);
626 static void moveCommand(redisClient *c);
627 static void renameCommand(redisClient *c);
628 static void renamenxCommand(redisClient *c);
629 static void lpushCommand(redisClient *c);
630 static void rpushCommand(redisClient *c);
631 static void lpopCommand(redisClient *c);
632 static void rpopCommand(redisClient *c);
633 static void llenCommand(redisClient *c);
634 static void lindexCommand(redisClient *c);
635 static void lrangeCommand(redisClient *c);
636 static void ltrimCommand(redisClient *c);
637 static void typeCommand(redisClient *c);
638 static void lsetCommand(redisClient *c);
639 static void saddCommand(redisClient *c);
640 static void sremCommand(redisClient *c);
641 static void smoveCommand(redisClient *c);
642 static void sismemberCommand(redisClient *c);
643 static void scardCommand(redisClient *c);
644 static void spopCommand(redisClient *c);
645 static void srandmemberCommand(redisClient *c);
646 static void sinterCommand(redisClient *c);
647 static void sinterstoreCommand(redisClient *c);
648 static void sunionCommand(redisClient *c);
649 static void sunionstoreCommand(redisClient *c);
650 static void sdiffCommand(redisClient *c);
651 static void sdiffstoreCommand(redisClient *c);
652 static void syncCommand(redisClient *c);
653 static void flushdbCommand(redisClient *c);
654 static void flushallCommand(redisClient *c);
655 static void sortCommand(redisClient *c);
656 static void lremCommand(redisClient *c);
657 static void rpoplpushcommand(redisClient *c);
658 static void infoCommand(redisClient *c);
659 static void mgetCommand(redisClient *c);
660 static void monitorCommand(redisClient *c);
661 static void expireCommand(redisClient *c);
662 static void expireatCommand(redisClient *c);
663 static void getsetCommand(redisClient *c);
664 static void ttlCommand(redisClient *c);
665 static void slaveofCommand(redisClient *c);
666 static void debugCommand(redisClient *c);
667 static void msetCommand(redisClient *c);
668 static void msetnxCommand(redisClient *c);
669 static void zaddCommand(redisClient *c);
670 static void zincrbyCommand(redisClient *c);
671 static void zrangeCommand(redisClient *c);
672 static void zrangebyscoreCommand(redisClient *c);
673 static void zcountCommand(redisClient *c);
674 static void zrevrangeCommand(redisClient *c);
675 static void zcardCommand(redisClient *c);
676 static void zremCommand(redisClient *c);
677 static void zscoreCommand(redisClient *c);
678 static void zremrangebyscoreCommand(redisClient *c);
679 static void multiCommand(redisClient *c);
680 static void execCommand(redisClient *c);
681 static void discardCommand(redisClient *c);
682 static void blpopCommand(redisClient *c);
683 static void brpopCommand(redisClient *c);
684 static void appendCommand(redisClient *c);
685 static void substrCommand(redisClient *c);
686 static void zrankCommand(redisClient *c);
687 static void zrevrankCommand(redisClient *c);
688 static void hsetCommand(redisClient *c);
689 static void hgetCommand(redisClient *c);
690 static void hdelCommand(redisClient *c);
691 static void hlenCommand(redisClient *c);
692 static void zremrangebyrankCommand(redisClient *c);
693 static void zunionCommand(redisClient *c);
694 static void zinterCommand(redisClient *c);
695 static void hkeysCommand(redisClient *c);
696 static void hvalsCommand(redisClient *c);
697 static void hgetallCommand(redisClient *c);
698 static void hexistsCommand(redisClient *c);
699 static void configCommand(redisClient *c);
700
701 /*================================= Globals ================================= */
702
/* Global vars */
/* The single, file-local instance of the server state. Its type, struct
 * redisServer, is declared earlier in this file. */
static struct redisServer server; /* server global state */
705 static struct redisCommand cmdTable[] = {
706 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
707 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
708 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
709 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
710 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
711 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
712 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
713 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
714 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
715 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
716 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
717 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
718 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
719 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
720 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
721 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
723 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
725 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
726 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
727 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
728 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
729 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
730 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
731 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
732 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
733 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
737 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
738 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
739 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
740 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
741 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
742 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
746 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
749 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
750 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
759 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
760 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
761 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
766 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
770 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
771 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
772 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
773 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
777 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
778 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
779 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
780 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
783 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
785 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
786 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
787 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
788 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
791 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
795 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
802 {NULL,NULL,0,0,NULL,0,0,0}
803 };
804
805 static void usage();
806
807 /*============================ Utility functions ============================ */
808
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Returns 1 when the first stringLen bytes of 'string' match the first
 * patternLen bytes of 'pattern', 0 otherwise. Supported syntax:
 *
 *   *        any sequence of characters (including the empty one)
 *   ?        exactly one character
 *   [abc]    one character out of a class; a leading '^' negates the class,
 *            'a-z' denotes a range, '\' escapes the next class member
 *   \x       the character x taken verbatim
 *
 * When 'nocase' is non zero literals and ranges are compared through
 * tolower(); an escaped class member is always compared verbatim.
 *
 * Neither buffer needs to be NUL terminated: no byte at or after
 * pattern+patternLen or string+stringLen is ever read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, looking only at bytes that are
             * still part of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character: with nothing left
             * in the string it can't match (not even a negated one). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
931
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
935
936 static void redisLog(int level, const char *fmt, ...) {
937 va_list ap;
938 FILE *fp;
939
940 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
941 if (!fp) return;
942
943 va_start(ap, fmt);
944 if (level >= server.verbosity) {
945 char *c = ".-*#";
946 char buf[64];
947 time_t now;
948
949 now = time(NULL);
950 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
951 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
952 vfprintf(fp, fmt, ap);
953 fprintf(fp,"\n");
954 fflush(fp);
955 }
956 va_end(ap);
957
958 if (server.logfile) fclose(fp);
959 }
960
961 /*====================== Hash table type implementation ==================== */
962
963 /* This is a hash table type that uses the SDS dynamic strings library as
964 * keys and redis objects as values (objects can hold SDS strings,
965 * lists, sets). */
966
/* Dict value destructor that hands the value straight to zfree().
 * The private data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
972
973 static void dictListDestructor(void *privdata, void *val)
974 {
975 DICT_NOTUSED(privdata);
976 listRelease((list*)val);
977 }
978
979 static int sdsDictKeyCompare(void *privdata, const void *key1,
980 const void *key2)
981 {
982 int l1,l2;
983 DICT_NOTUSED(privdata);
984
985 l1 = sdslen((sds)key1);
986 l2 = sdslen((sds)key2);
987 if (l1 != l2) return 0;
988 return memcmp(key1, key2, l1) == 0;
989 }
990
991 static void dictRedisObjectDestructor(void *privdata, void *val)
992 {
993 DICT_NOTUSED(privdata);
994
995 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
996 decrRefCount(val);
997 }
998
999 static int dictObjKeyCompare(void *privdata, const void *key1,
1000 const void *key2)
1001 {
1002 const robj *o1 = key1, *o2 = key2;
1003 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1004 }
1005
1006 static unsigned int dictObjHash(const void *key) {
1007 const robj *o = key;
1008 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1009 }
1010
1011 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1015 int cmp;
1016
1017 if (o1->encoding == REDIS_ENCODING_INT &&
1018 o2->encoding == REDIS_ENCODING_INT &&
1019 o1->ptr == o2->ptr) return 1;
1020
1021 o1 = getDecodedObject(o1);
1022 o2 = getDecodedObject(o2);
1023 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1024 decrRefCount(o1);
1025 decrRefCount(o2);
1026 return cmp;
1027 }
1028
1029 static unsigned int dictEncObjHash(const void *key) {
1030 robj *o = (robj*) key;
1031
1032 if (o->encoding == REDIS_ENCODING_RAW) {
1033 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1034 } else {
1035 if (o->encoding == REDIS_ENCODING_INT) {
1036 char buf[32];
1037 int len;
1038
1039 len = snprintf(buf,32,"%ld",(long)o->ptr);
1040 return dictGenHashFunction((unsigned char*)buf, len);
1041 } else {
1042 unsigned int hash;
1043
1044 o = getDecodedObject(o);
1045 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1046 decrRefCount(o);
1047 return hash;
1048 }
1049 }
1050 }
1051
1052 /* Sets type and expires */
1053 static dictType setDictType = {
1054 dictEncObjHash, /* hash function */
1055 NULL, /* key dup */
1056 NULL, /* val dup */
1057 dictEncObjKeyCompare, /* key compare */
1058 dictRedisObjectDestructor, /* key destructor */
1059 NULL /* val destructor */
1060 };
1061
1062 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1063 static dictType zsetDictType = {
1064 dictEncObjHash, /* hash function */
1065 NULL, /* key dup */
1066 NULL, /* val dup */
1067 dictEncObjKeyCompare, /* key compare */
1068 dictRedisObjectDestructor, /* key destructor */
1069 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1070 };
1071
1072 /* Db->dict */
1073 static dictType dbDictType = {
1074 dictObjHash, /* hash function */
1075 NULL, /* key dup */
1076 NULL, /* val dup */
1077 dictObjKeyCompare, /* key compare */
1078 dictRedisObjectDestructor, /* key destructor */
1079 dictRedisObjectDestructor /* val destructor */
1080 };
1081
1082 /* Db->expires */
1083 static dictType keyptrDictType = {
1084 dictObjHash, /* hash function */
1085 NULL, /* key dup */
1086 NULL, /* val dup */
1087 dictObjKeyCompare, /* key compare */
1088 dictRedisObjectDestructor, /* key destructor */
1089 NULL /* val destructor */
1090 };
1091
1092 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1093 static dictType hashDictType = {
1094 dictEncObjHash, /* hash function */
1095 NULL, /* key dup */
1096 NULL, /* val dup */
1097 dictEncObjKeyCompare, /* key compare */
1098 dictRedisObjectDestructor, /* key destructor */
1099 dictRedisObjectDestructor /* val destructor */
1100 };
1101
1102 /* Keylist hash table type has unencoded redis objects as keys and
1103 * lists as values. It's used for blocking operations (BLPOP) and to
1104 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1105 static dictType keylistDictType = {
1106 dictObjHash, /* hash function */
1107 NULL, /* key dup */
1108 NULL, /* val dup */
1109 dictObjKeyCompare, /* key compare */
1110 dictRedisObjectDestructor, /* key destructor */
1111 dictListDestructor /* val destructor */
1112 };
1113
1114 static void version();
1115
1116 /* ========================= Random utility functions ======================= */
1117
1118 /* Redis generally does not try to recover from out of memory conditions
1119 * when allocating objects or strings, it is not clear if it will be possible
1120 * to report this condition to the client since the networking layer itself
1121 * is based on heap allocation for send buffers, so we simply abort.
1122 * At least the code will be simpler to read... */
1123 static void oom(const char *msg) {
1124 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1125 sleep(1);
1126 abort();
1127 }
1128
1129 /* ====================== Redis server networking stuff ===================== */
1130 static void closeTimedoutClients(void) {
1131 redisClient *c;
1132 listNode *ln;
1133 time_t now = time(NULL);
1134 listIter li;
1135
1136 listRewind(server.clients,&li);
1137 while ((ln = listNext(&li)) != NULL) {
1138 c = listNodeValue(ln);
1139 if (server.maxidletime &&
1140 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1141 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1142 (now - c->lastinteraction > server.maxidletime))
1143 {
1144 redisLog(REDIS_VERBOSE,"Closing idle client");
1145 freeClient(c);
1146 } else if (c->flags & REDIS_BLOCKED) {
1147 if (c->blockingto != 0 && c->blockingto < now) {
1148 addReply(c,shared.nullmultibulk);
1149 unblockClientWaitingData(c);
1150 }
1151 }
1152 }
1153 }
1154
1155 static int htNeedsResize(dict *dict) {
1156 long long size, used;
1157
1158 size = dictSlots(dict);
1159 used = dictSize(dict);
1160 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1161 (used*100/size < REDIS_HT_MINFILL));
1162 }
1163
1164 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1165 * we resize the hash table to save memory */
1166 static void tryResizeHashTables(void) {
1167 int j;
1168
1169 for (j = 0; j < server.dbnum; j++) {
1170 if (htNeedsResize(server.db[j].dict)) {
1171 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1172 dictResize(server.db[j].dict);
1173 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1174 }
1175 if (htNeedsResize(server.db[j].expires))
1176 dictResize(server.db[j].expires);
1177 }
1178 }
1179
1180 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1181 void backgroundSaveDoneHandler(int statloc) {
1182 int exitcode = WEXITSTATUS(statloc);
1183 int bysignal = WIFSIGNALED(statloc);
1184
1185 if (!bysignal && exitcode == 0) {
1186 redisLog(REDIS_NOTICE,
1187 "Background saving terminated with success");
1188 server.dirty = 0;
1189 server.lastsave = time(NULL);
1190 } else if (!bysignal && exitcode != 0) {
1191 redisLog(REDIS_WARNING, "Background saving error");
1192 } else {
1193 redisLog(REDIS_WARNING,
1194 "Background saving terminated by signal");
1195 rdbRemoveTempFile(server.bgsavechildpid);
1196 }
1197 server.bgsavechildpid = -1;
1198 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1199 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1200 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1201 }
1202
1203 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1204 * Handle this. */
1205 void backgroundRewriteDoneHandler(int statloc) {
1206 int exitcode = WEXITSTATUS(statloc);
1207 int bysignal = WIFSIGNALED(statloc);
1208
1209 if (!bysignal && exitcode == 0) {
1210 int fd;
1211 char tmpfile[256];
1212
1213 redisLog(REDIS_NOTICE,
1214 "Background append only file rewriting terminated with success");
1215 /* Now it's time to flush the differences accumulated by the parent */
1216 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1217 fd = open(tmpfile,O_WRONLY|O_APPEND);
1218 if (fd == -1) {
1219 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1220 goto cleanup;
1221 }
1222 /* Flush our data... */
1223 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1224 (signed) sdslen(server.bgrewritebuf)) {
1225 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1226 close(fd);
1227 goto cleanup;
1228 }
1229 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1230 /* Now our work is to rename the temp file into the stable file. And
1231 * switch the file descriptor used by the server for append only. */
1232 if (rename(tmpfile,server.appendfilename) == -1) {
1233 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1234 close(fd);
1235 goto cleanup;
1236 }
1237 /* Mission completed... almost */
1238 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1239 if (server.appendfd != -1) {
1240 /* If append only is actually enabled... */
1241 close(server.appendfd);
1242 server.appendfd = fd;
1243 fsync(fd);
1244 server.appendseldb = -1; /* Make sure it will issue SELECT */
1245 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1246 } else {
1247 /* If append only is disabled we just generate a dump in this
1248 * format. Why not? */
1249 close(fd);
1250 }
1251 } else if (!bysignal && exitcode != 0) {
1252 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1253 } else {
1254 redisLog(REDIS_WARNING,
1255 "Background append only file rewriting terminated by signal");
1256 }
1257 cleanup:
1258 sdsfree(server.bgrewritebuf);
1259 server.bgrewritebuf = sdsempty();
1260 aofRemoveTempFile(server.bgrewritechildpid);
1261 server.bgrewritechildpid = -1;
1262 }
1263
1264 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1265 int j, loops = server.cronloops++;
1266 REDIS_NOTUSED(eventLoop);
1267 REDIS_NOTUSED(id);
1268 REDIS_NOTUSED(clientData);
1269
1270 /* We take a cached value of the unix time in the global state because
1271 * with virtual memory and aging there is to store the current time
1272 * in objects at every object access, and accuracy is not needed.
1273 * To access a global var is faster than calling time(NULL) */
1274 server.unixtime = time(NULL);
1275
1276 /* Show some info about non-empty databases */
1277 for (j = 0; j < server.dbnum; j++) {
1278 long long size, used, vkeys;
1279
1280 size = dictSlots(server.db[j].dict);
1281 used = dictSize(server.db[j].dict);
1282 vkeys = dictSize(server.db[j].expires);
1283 if (!(loops % 50) && (used || vkeys)) {
1284 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1285 /* dictPrintStats(server.dict); */
1286 }
1287 }
1288
1289 /* We don't want to resize the hash tables while a bacground saving
1290 * is in progress: the saving child is created using fork() that is
1291 * implemented with a copy-on-write semantic in most modern systems, so
1292 * if we resize the HT while there is the saving child at work actually
1293 * a lot of memory movements in the parent will cause a lot of pages
1294 * copied. */
1295 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1296
1297 /* Show information about connected clients */
1298 if (!(loops % 50)) {
1299 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1300 listLength(server.clients)-listLength(server.slaves),
1301 listLength(server.slaves),
1302 zmalloc_used_memory(),
1303 dictSize(server.sharingpool));
1304 }
1305
1306 /* Close connections of timedout clients */
1307 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1308 closeTimedoutClients();
1309
1310 /* Check if a background saving or AOF rewrite in progress terminated */
1311 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1312 int statloc;
1313 pid_t pid;
1314
1315 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1316 if (pid == server.bgsavechildpid) {
1317 backgroundSaveDoneHandler(statloc);
1318 } else {
1319 backgroundRewriteDoneHandler(statloc);
1320 }
1321 }
1322 } else {
1323 /* If there is not a background saving in progress check if
1324 * we have to save now */
1325 time_t now = time(NULL);
1326 for (j = 0; j < server.saveparamslen; j++) {
1327 struct saveparam *sp = server.saveparams+j;
1328
1329 if (server.dirty >= sp->changes &&
1330 now-server.lastsave > sp->seconds) {
1331 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1332 sp->changes, sp->seconds);
1333 rdbSaveBackground(server.dbfilename);
1334 break;
1335 }
1336 }
1337 }
1338
1339 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1340 * will use few CPU cycles if there are few expiring keys, otherwise
1341 * it will get more aggressive to avoid that too much memory is used by
1342 * keys that can be removed from the keyspace. */
1343 for (j = 0; j < server.dbnum; j++) {
1344 int expired;
1345 redisDb *db = server.db+j;
1346
1347 /* Continue to expire if at the end of the cycle more than 25%
1348 * of the keys were expired. */
1349 do {
1350 long num = dictSize(db->expires);
1351 time_t now = time(NULL);
1352
1353 expired = 0;
1354 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1355 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1356 while (num--) {
1357 dictEntry *de;
1358 time_t t;
1359
1360 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1361 t = (time_t) dictGetEntryVal(de);
1362 if (now > t) {
1363 deleteKey(db,dictGetEntryKey(de));
1364 expired++;
1365 server.stat_expiredkeys++;
1366 }
1367 }
1368 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1369 }
1370
1371 /* Swap a few keys on disk if we are over the memory limit and VM
1372 * is enbled. Try to free objects from the free list first. */
1373 if (vmCanSwapOut()) {
1374 while (server.vm_enabled && zmalloc_used_memory() >
1375 server.vm_max_memory)
1376 {
1377 int retval;
1378
1379 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1380 retval = (server.vm_max_threads == 0) ?
1381 vmSwapOneObjectBlocking() :
1382 vmSwapOneObjectThreaded();
1383 if (retval == REDIS_ERR && !(loops % 300) &&
1384 zmalloc_used_memory() >
1385 (server.vm_max_memory+server.vm_max_memory/10))
1386 {
1387 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1388 }
1389 /* Note that when using threade I/O we free just one object,
1390 * because anyway when the I/O thread in charge to swap this
1391 * object out will finish, the handler of completed jobs
1392 * will try to swap more objects if we are still out of memory. */
1393 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1394 }
1395 }
1396
1397 /* Check if we should connect to a MASTER */
1398 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1399 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1400 if (syncWithMaster() == REDIS_OK) {
1401 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1402 }
1403 }
1404 return 100;
1405 }
1406
1407 /* This function gets called every time Redis is entering the
1408 * main loop of the event driven library, that is, before to sleep
1409 * for ready file descriptors. */
1410 static void beforeSleep(struct aeEventLoop *eventLoop) {
1411 REDIS_NOTUSED(eventLoop);
1412
1413 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1414 listIter li;
1415 listNode *ln;
1416
1417 listRewind(server.io_ready_clients,&li);
1418 while((ln = listNext(&li))) {
1419 redisClient *c = ln->value;
1420 struct redisCommand *cmd;
1421
1422 /* Resume the client. */
1423 listDelNode(server.io_ready_clients,ln);
1424 c->flags &= (~REDIS_IO_WAIT);
1425 server.vm_blocked_clients--;
1426 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1427 readQueryFromClient, c);
1428 cmd = lookupCommand(c->argv[0]->ptr);
1429 assert(cmd != NULL);
1430 call(c,cmd);
1431 resetClient(c);
1432 /* There may be more data to process in the input buffer. */
1433 if (c->querybuf && sdslen(c->querybuf) > 0)
1434 processInputBuffer(c);
1435 }
1436 }
1437 }
1438
1439 static void createSharedObjects(void) {
1440 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1441 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1442 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1443 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1444 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1445 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1446 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1447 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1448 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1449 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1450 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1451 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1452 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1453 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1454 "-ERR no such key\r\n"));
1455 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1456 "-ERR syntax error\r\n"));
1457 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1458 "-ERR source and destination objects are the same\r\n"));
1459 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1460 "-ERR index out of range\r\n"));
1461 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1462 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1463 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1464 shared.select0 = createStringObject("select 0\r\n",10);
1465 shared.select1 = createStringObject("select 1\r\n",10);
1466 shared.select2 = createStringObject("select 2\r\n",10);
1467 shared.select3 = createStringObject("select 3\r\n",10);
1468 shared.select4 = createStringObject("select 4\r\n",10);
1469 shared.select5 = createStringObject("select 5\r\n",10);
1470 shared.select6 = createStringObject("select 6\r\n",10);
1471 shared.select7 = createStringObject("select 7\r\n",10);
1472 shared.select8 = createStringObject("select 8\r\n",10);
1473 shared.select9 = createStringObject("select 9\r\n",10);
1474 }
1475
1476 static void appendServerSaveParams(time_t seconds, int changes) {
1477 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1478 server.saveparams[server.saveparamslen].seconds = seconds;
1479 server.saveparams[server.saveparamslen].changes = changes;
1480 server.saveparamslen++;
1481 }
1482
1483 static void resetServerSaveParams() {
1484 zfree(server.saveparams);
1485 server.saveparams = NULL;
1486 server.saveparamslen = 0;
1487 }
1488
1489 static void initServerConfig() {
1490 server.dbnum = REDIS_DEFAULT_DBNUM;
1491 server.port = REDIS_SERVERPORT;
1492 server.verbosity = REDIS_VERBOSE;
1493 server.maxidletime = REDIS_MAXIDLETIME;
1494 server.saveparams = NULL;
1495 server.logfile = NULL; /* NULL = log on standard output */
1496 server.bindaddr = NULL;
1497 server.glueoutputbuf = 1;
1498 server.daemonize = 0;
1499 server.appendonly = 0;
1500 server.appendfsync = APPENDFSYNC_ALWAYS;
1501 server.lastfsync = time(NULL);
1502 server.appendfd = -1;
1503 server.appendseldb = -1; /* Make sure the first time will not match */
1504 server.pidfile = zstrdup("/var/run/redis.pid");
1505 server.dbfilename = zstrdup("dump.rdb");
1506 server.appendfilename = zstrdup("appendonly.aof");
1507 server.requirepass = NULL;
1508 server.shareobjects = 0;
1509 server.rdbcompression = 1;
1510 server.sharingpoolsize = 1024;
1511 server.maxclients = 0;
1512 server.blpop_blocked_clients = 0;
1513 server.maxmemory = 0;
1514 server.vm_enabled = 0;
1515 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1516 server.vm_page_size = 256; /* 256 bytes per page */
1517 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1518 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1519 server.vm_max_threads = 4;
1520 server.vm_blocked_clients = 0;
1521 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1522 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1523
1524 resetServerSaveParams();
1525
1526 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1527 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1528 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1529 /* Replication related */
1530 server.isslave = 0;
1531 server.masterauth = NULL;
1532 server.masterhost = NULL;
1533 server.masterport = 6379;
1534 server.master = NULL;
1535 server.replstate = REDIS_REPL_NONE;
1536
1537 /* Double constants initialization */
1538 R_Zero = 0.0;
1539 R_PosInf = 1.0/R_Zero;
1540 R_NegInf = -1.0/R_Zero;
1541 R_Nan = R_Zero/R_Zero;
1542 }
1543
1544 static void initServer() {
1545 int j;
1546
1547 signal(SIGHUP, SIG_IGN);
1548 signal(SIGPIPE, SIG_IGN);
1549 setupSigSegvAction();
1550
1551 server.devnull = fopen("/dev/null","w");
1552 if (server.devnull == NULL) {
1553 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1554 exit(1);
1555 }
1556 server.clients = listCreate();
1557 server.slaves = listCreate();
1558 server.monitors = listCreate();
1559 server.objfreelist = listCreate();
1560 createSharedObjects();
1561 server.el = aeCreateEventLoop();
1562 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1563 server.sharingpool = dictCreate(&setDictType,NULL);
1564 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1565 if (server.fd == -1) {
1566 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1567 exit(1);
1568 }
1569 for (j = 0; j < server.dbnum; j++) {
1570 server.db[j].dict = dictCreate(&dbDictType,NULL);
1571 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1572 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1573 if (server.vm_enabled)
1574 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1575 server.db[j].id = j;
1576 }
1577 server.cronloops = 0;
1578 server.bgsavechildpid = -1;
1579 server.bgrewritechildpid = -1;
1580 server.bgrewritebuf = sdsempty();
1581 server.lastsave = time(NULL);
1582 server.dirty = 0;
1583 server.stat_numcommands = 0;
1584 server.stat_numconnections = 0;
1585 server.stat_expiredkeys = 0;
1586 server.stat_starttime = time(NULL);
1587 server.unixtime = time(NULL);
1588 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1589 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1590 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1591
1592 if (server.appendonly) {
1593 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1594 if (server.appendfd == -1) {
1595 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1596 strerror(errno));
1597 exit(1);
1598 }
1599 }
1600
1601 if (server.vm_enabled) vmInit();
1602 }
1603
1604 /* Empty the whole database */
1605 static long long emptyDb() {
1606 int j;
1607 long long removed = 0;
1608
1609 for (j = 0; j < server.dbnum; j++) {
1610 removed += dictSize(server.db[j].dict);
1611 dictEmpty(server.db[j].dict);
1612 dictEmpty(server.db[j].expires);
1613 }
1614 return removed;
1615 }
1616
/* Convert a "yes" / "no" string (compared ignoring case) to an integer.
 * Returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1622
1623 /* I agree, this is a very rudimental way to load a configuration...
1624 will improve later if the config gets more complex */
1625 static void loadServerConfig(char *filename) {
1626 FILE *fp;
1627 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1628 int linenum = 0;
1629 sds line = NULL;
1630 char *errormsg = "Fatal error, can't open config file '%s'";
1631 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1632 sprintf(errorbuf, errormsg, filename);
1633
1634 if (filename[0] == '-' && filename[1] == '\0')
1635 fp = stdin;
1636 else {
1637 if ((fp = fopen(filename,"r")) == NULL) {
1638 redisLog(REDIS_WARNING, errorbuf);
1639 exit(1);
1640 }
1641 }
1642
1643 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1644 sds *argv;
1645 int argc, j;
1646
1647 linenum++;
1648 line = sdsnew(buf);
1649 line = sdstrim(line," \t\r\n");
1650
1651 /* Skip comments and blank lines*/
1652 if (line[0] == '#' || line[0] == '\0') {
1653 sdsfree(line);
1654 continue;
1655 }
1656
1657 /* Split into arguments */
1658 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1659 sdstolower(argv[0]);
1660
1661 /* Execute config directives */
1662 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1663 server.maxidletime = atoi(argv[1]);
1664 if (server.maxidletime < 0) {
1665 err = "Invalid timeout value"; goto loaderr;
1666 }
1667 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1668 server.port = atoi(argv[1]);
1669 if (server.port < 1 || server.port > 65535) {
1670 err = "Invalid port"; goto loaderr;
1671 }
1672 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1673 server.bindaddr = zstrdup(argv[1]);
1674 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1675 int seconds = atoi(argv[1]);
1676 int changes = atoi(argv[2]);
1677 if (seconds < 1 || changes < 0) {
1678 err = "Invalid save parameters"; goto loaderr;
1679 }
1680 appendServerSaveParams(seconds,changes);
1681 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1682 if (chdir(argv[1]) == -1) {
1683 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1684 argv[1], strerror(errno));
1685 exit(1);
1686 }
1687 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1688 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1689 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1690 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1691 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1692 else {
1693 err = "Invalid log level. Must be one of debug, notice, warning";
1694 goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1697 FILE *logfp;
1698
1699 server.logfile = zstrdup(argv[1]);
1700 if (!strcasecmp(server.logfile,"stdout")) {
1701 zfree(server.logfile);
1702 server.logfile = NULL;
1703 }
1704 if (server.logfile) {
1705 /* Test if we are able to open the file. The server will not
1706 * be able to abort just for this problem later... */
1707 logfp = fopen(server.logfile,"a");
1708 if (logfp == NULL) {
1709 err = sdscatprintf(sdsempty(),
1710 "Can't open the log file: %s", strerror(errno));
1711 goto loaderr;
1712 }
1713 fclose(logfp);
1714 }
1715 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1716 server.dbnum = atoi(argv[1]);
1717 if (server.dbnum < 1) {
1718 err = "Invalid number of databases"; goto loaderr;
1719 }
1720 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1721 loadServerConfig(argv[1]);
1722 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1723 server.maxclients = atoi(argv[1]);
1724 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1725 server.maxmemory = strtoll(argv[1], NULL, 10);
1726 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1727 server.masterhost = sdsnew(argv[1]);
1728 server.masterport = atoi(argv[2]);
1729 server.replstate = REDIS_REPL_CONNECT;
1730 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1731 server.masterauth = zstrdup(argv[1]);
1732 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1733 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1734 err = "argument must be 'yes' or 'no'"; goto loaderr;
1735 }
1736 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1737 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1738 err = "argument must be 'yes' or 'no'"; goto loaderr;
1739 }
1740 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1741 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1742 err = "argument must be 'yes' or 'no'"; goto loaderr;
1743 }
1744 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1745 server.sharingpoolsize = atoi(argv[1]);
1746 if (server.sharingpoolsize < 1) {
1747 err = "invalid object sharing pool size"; goto loaderr;
1748 }
1749 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1750 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1751 err = "argument must be 'yes' or 'no'"; goto loaderr;
1752 }
1753 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1754 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1755 err = "argument must be 'yes' or 'no'"; goto loaderr;
1756 }
1757 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1758 if (!strcasecmp(argv[1],"no")) {
1759 server.appendfsync = APPENDFSYNC_NO;
1760 } else if (!strcasecmp(argv[1],"always")) {
1761 server.appendfsync = APPENDFSYNC_ALWAYS;
1762 } else if (!strcasecmp(argv[1],"everysec")) {
1763 server.appendfsync = APPENDFSYNC_EVERYSEC;
1764 } else {
1765 err = "argument must be 'no', 'always' or 'everysec'";
1766 goto loaderr;
1767 }
1768 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1769 server.requirepass = zstrdup(argv[1]);
1770 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1771 zfree(server.pidfile);
1772 server.pidfile = zstrdup(argv[1]);
1773 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1774 zfree(server.dbfilename);
1775 server.dbfilename = zstrdup(argv[1]);
1776 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1777 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1778 err = "argument must be 'yes' or 'no'"; goto loaderr;
1779 }
1780 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1781 zfree(server.vm_swap_file);
1782 server.vm_swap_file = zstrdup(argv[1]);
1783 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1784 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1785 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1786 server.vm_page_size = strtoll(argv[1], NULL, 10);
1787 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1788 server.vm_pages = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1790 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1791 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1792 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1793 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1794 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1795 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1796 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1797 } else {
1798 err = "Bad directive or wrong number of arguments"; goto loaderr;
1799 }
1800 for (j = 0; j < argc; j++)
1801 sdsfree(argv[j]);
1802 zfree(argv);
1803 sdsfree(line);
1804 }
1805 if (fp != stdin) fclose(fp);
1806 return;
1807
1808 loaderr:
1809 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1810 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1811 fprintf(stderr, ">>> '%s'\n", line);
1812 fprintf(stderr, "%s\n", err);
1813 exit(1);
1814 }
1815
1816 static void freeClientArgv(redisClient *c) {
1817 int j;
1818
1819 for (j = 0; j < c->argc; j++)
1820 decrRefCount(c->argv[j]);
1821 for (j = 0; j < c->mbargc; j++)
1822 decrRefCount(c->mbargv[j]);
1823 c->argc = 0;
1824 c->mbargc = 0;
1825 }
1826
1827 static void freeClient(redisClient *c) {
1828 listNode *ln;
1829
1830 /* Note that if the client we are freeing is blocked into a blocking
1831 * call, we have to set querybuf to NULL *before* to call
1832 * unblockClientWaitingData() to avoid processInputBuffer() will get
1833 * called. Also it is important to remove the file events after
1834 * this, because this call adds the READABLE event. */
1835 sdsfree(c->querybuf);
1836 c->querybuf = NULL;
1837 if (c->flags & REDIS_BLOCKED)
1838 unblockClientWaitingData(c);
1839
1840 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1841 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1842 listRelease(c->reply);
1843 freeClientArgv(c);
1844 close(c->fd);
1845 /* Remove from the list of clients */
1846 ln = listSearchKey(server.clients,c);
1847 redisAssert(ln != NULL);
1848 listDelNode(server.clients,ln);
1849 /* Remove from the list of clients waiting for swapped keys */
1850 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1851 ln = listSearchKey(server.io_ready_clients,c);
1852 if (ln) {
1853 listDelNode(server.io_ready_clients,ln);
1854 server.vm_blocked_clients--;
1855 }
1856 }
1857 while (server.vm_enabled && listLength(c->io_keys)) {
1858 ln = listFirst(c->io_keys);
1859 dontWaitForSwappedKey(c,ln->value);
1860 }
1861 listRelease(c->io_keys);
1862 /* Other cleanup */
1863 if (c->flags & REDIS_SLAVE) {
1864 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1865 close(c->repldbfd);
1866 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1867 ln = listSearchKey(l,c);
1868 redisAssert(ln != NULL);
1869 listDelNode(l,ln);
1870 }
1871 if (c->flags & REDIS_MASTER) {
1872 server.master = NULL;
1873 server.replstate = REDIS_REPL_CONNECT;
1874 }
1875 zfree(c->argv);
1876 zfree(c->mbargv);
1877 freeClientMultiState(c);
1878 zfree(c);
1879 }
1880
1881 #define GLUEREPLY_UP_TO (1024)
1882 static void glueReplyBuffersIfNeeded(redisClient *c) {
1883 int copylen = 0;
1884 char buf[GLUEREPLY_UP_TO];
1885 listNode *ln;
1886 listIter li;
1887 robj *o;
1888
1889 listRewind(c->reply,&li);
1890 while((ln = listNext(&li))) {
1891 int objlen;
1892
1893 o = ln->value;
1894 objlen = sdslen(o->ptr);
1895 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1896 memcpy(buf+copylen,o->ptr,objlen);
1897 copylen += objlen;
1898 listDelNode(c->reply,ln);
1899 } else {
1900 if (copylen == 0) return;
1901 break;
1902 }
1903 }
1904 /* Now the output buffer is empty, add the new single element */
1905 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1906 listAddNodeHead(c->reply,o);
1907 }
1908
1909 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1910 redisClient *c = privdata;
1911 int nwritten = 0, totwritten = 0, objlen;
1912 robj *o;
1913 REDIS_NOTUSED(el);
1914 REDIS_NOTUSED(mask);
1915
1916 /* Use writev() if we have enough buffers to send */
1917 if (!server.glueoutputbuf &&
1918 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1919 !(c->flags & REDIS_MASTER))
1920 {
1921 sendReplyToClientWritev(el, fd, privdata, mask);
1922 return;
1923 }
1924
1925 while(listLength(c->reply)) {
1926 if (server.glueoutputbuf && listLength(c->reply) > 1)
1927 glueReplyBuffersIfNeeded(c);
1928
1929 o = listNodeValue(listFirst(c->reply));
1930 objlen = sdslen(o->ptr);
1931
1932 if (objlen == 0) {
1933 listDelNode(c->reply,listFirst(c->reply));
1934 continue;
1935 }
1936
1937 if (c->flags & REDIS_MASTER) {
1938 /* Don't reply to a master */
1939 nwritten = objlen - c->sentlen;
1940 } else {
1941 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1942 if (nwritten <= 0) break;
1943 }
1944 c->sentlen += nwritten;
1945 totwritten += nwritten;
1946 /* If we fully sent the object on head go to the next one */
1947 if (c->sentlen == objlen) {
1948 listDelNode(c->reply,listFirst(c->reply));
1949 c->sentlen = 0;
1950 }
1951 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1952 * bytes, in a single threaded server it's a good idea to serve
1953 * other clients as well, even if a very large request comes from
1954 * super fast link that is always able to accept data (in real world
1955 * scenario think about 'KEYS *' against the loopback interfae) */
1956 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1957 }
1958 if (nwritten == -1) {
1959 if (errno == EAGAIN) {
1960 nwritten = 0;
1961 } else {
1962 redisLog(REDIS_VERBOSE,
1963 "Error writing to client: %s", strerror(errno));
1964 freeClient(c);
1965 return;
1966 }
1967 }
1968 if (totwritten > 0) c->lastinteraction = time(NULL);
1969 if (listLength(c->reply) == 0) {
1970 c->sentlen = 0;
1971 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1972 }
1973 }
1974
1975 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1976 {
1977 redisClient *c = privdata;
1978 int nwritten = 0, totwritten = 0, objlen, willwrite;
1979 robj *o;
1980 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1981 int offset, ion = 0;
1982 REDIS_NOTUSED(el);
1983 REDIS_NOTUSED(mask);
1984
1985 listNode *node;
1986 while (listLength(c->reply)) {
1987 offset = c->sentlen;
1988 ion = 0;
1989 willwrite = 0;
1990
1991 /* fill-in the iov[] array */
1992 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1993 o = listNodeValue(node);
1994 objlen = sdslen(o->ptr);
1995
1996 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1997 break;
1998
1999 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2000 break; /* no more iovecs */
2001
2002 iov[ion].iov_base = ((char*)o->ptr) + offset;
2003 iov[ion].iov_len = objlen - offset;
2004 willwrite += objlen - offset;
2005 offset = 0; /* just for the first item */
2006 ion++;
2007 }
2008
2009 if(willwrite == 0)
2010 break;
2011
2012 /* write all collected blocks at once */
2013 if((nwritten = writev(fd, iov, ion)) < 0) {
2014 if (errno != EAGAIN) {
2015 redisLog(REDIS_VERBOSE,
2016 "Error writing to client: %s", strerror(errno));
2017 freeClient(c);
2018 return;
2019 }
2020 break;
2021 }
2022
2023 totwritten += nwritten;
2024 offset = c->sentlen;
2025
2026 /* remove written robjs from c->reply */
2027 while (nwritten && listLength(c->reply)) {
2028 o = listNodeValue(listFirst(c->reply));
2029 objlen = sdslen(o->ptr);
2030
2031 if(nwritten >= objlen - offset) {
2032 listDelNode(c->reply, listFirst(c->reply));
2033 nwritten -= objlen - offset;
2034 c->sentlen = 0;
2035 } else {
2036 /* partial write */
2037 c->sentlen += nwritten;
2038 break;
2039 }
2040 offset = 0;
2041 }
2042 }
2043
2044 if (totwritten > 0)
2045 c->lastinteraction = time(NULL);
2046
2047 if (listLength(c->reply) == 0) {
2048 c->sentlen = 0;
2049 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2050 }
2051 }
2052
2053 static struct redisCommand *lookupCommand(char *name) {
2054 int j = 0;
2055 while(cmdTable[j].name != NULL) {
2056 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2057 j++;
2058 }
2059 return NULL;
2060 }
2061
2062 /* resetClient prepare the client to process the next command */
2063 static void resetClient(redisClient *c) {
2064 freeClientArgv(c);
2065 c->bulklen = -1;
2066 c->multibulk = 0;
2067 }
2068
2069 /* Call() is the core of Redis execution of a command */
2070 static void call(redisClient *c, struct redisCommand *cmd) {
2071 long long dirty;
2072
2073 dirty = server.dirty;
2074 cmd->proc(c);
2075 if (server.appendonly && server.dirty-dirty)
2076 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2077 if (server.dirty-dirty && listLength(server.slaves))
2078 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2079 if (listLength(server.monitors))
2080 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2081 server.stat_numcommands++;
2082 }
2083
2084 /* If this function gets called we already read a whole
2085 * command, argments are in the client argv/argc fields.
2086 * processCommand() execute the command or prepare the
2087 * server for a bulk read from the client.
2088 *
2089 * If 1 is returned the client is still alive and valid and
2090 * and other operations can be performed by the caller. Otherwise
2091 * if 0 is returned the client was destroied (i.e. after QUIT). */
2092 static int processCommand(redisClient *c) {
2093 struct redisCommand *cmd;
2094
2095 /* Free some memory if needed (maxmemory setting) */
2096 if (server.maxmemory) freeMemoryIfNeeded();
2097
2098 /* Handle the multi bulk command type. This is an alternative protocol
2099 * supported by Redis in order to receive commands that are composed of
2100 * multiple binary-safe "bulk" arguments. The latency of processing is
2101 * a bit higher but this allows things like multi-sets, so if this
2102 * protocol is used only for MSET and similar commands this is a big win. */
2103 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2104 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2105 if (c->multibulk <= 0) {
2106 resetClient(c);
2107 return 1;
2108 } else {
2109 decrRefCount(c->argv[c->argc-1]);
2110 c->argc--;
2111 return 1;
2112 }
2113 } else if (c->multibulk) {
2114 if (c->bulklen == -1) {
2115 if (((char*)c->argv[0]->ptr)[0] != '$') {
2116 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2117 resetClient(c);
2118 return 1;
2119 } else {
2120 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2121 decrRefCount(c->argv[0]);
2122 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2123 c->argc--;
2124 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2125 resetClient(c);
2126 return 1;
2127 }
2128 c->argc--;
2129 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2130 return 1;
2131 }
2132 } else {
2133 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2134 c->mbargv[c->mbargc] = c->argv[0];
2135 c->mbargc++;
2136 c->argc--;
2137 c->multibulk--;
2138 if (c->multibulk == 0) {
2139 robj **auxargv;
2140 int auxargc;
2141
2142 /* Here we need to swap the multi-bulk argc/argv with the
2143 * normal argc/argv of the client structure. */
2144 auxargv = c->argv;
2145 c->argv = c->mbargv;
2146 c->mbargv = auxargv;
2147
2148 auxargc = c->argc;
2149 c->argc = c->mbargc;
2150 c->mbargc = auxargc;
2151
2152 /* We need to set bulklen to something different than -1
2153 * in order for the code below to process the command without
2154 * to try to read the last argument of a bulk command as
2155 * a special argument. */
2156 c->bulklen = 0;
2157 /* continue below and process the command */
2158 } else {
2159 c->bulklen = -1;
2160 return 1;
2161 }
2162 }
2163 }
2164 /* -- end of multi bulk commands processing -- */
2165
2166 /* The QUIT command is handled as a special case. Normal command
2167 * procs are unable to close the client connection safely */
2168 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2169 freeClient(c);
2170 return 0;
2171 }
2172
2173 /* Now lookup the command and check ASAP about trivial error conditions
2174 * such wrong arity, bad command name and so forth. */
2175 cmd = lookupCommand(c->argv[0]->ptr);
2176 if (!cmd) {
2177 addReplySds(c,
2178 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2179 (char*)c->argv[0]->ptr));
2180 resetClient(c);
2181 return 1;
2182 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2183 (c->argc < -cmd->arity)) {
2184 addReplySds(c,
2185 sdscatprintf(sdsempty(),
2186 "-ERR wrong number of arguments for '%s' command\r\n",
2187 cmd->name));
2188 resetClient(c);
2189 return 1;
2190 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2191 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2192 resetClient(c);
2193 return 1;
2194 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2195 /* This is a bulk command, we have to read the last argument yet. */
2196 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2197
2198 decrRefCount(c->argv[c->argc-1]);
2199 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2200 c->argc--;
2201 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2202 resetClient(c);
2203 return 1;
2204 }
2205 c->argc--;
2206 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2207 /* It is possible that the bulk read is already in the
2208 * buffer. Check this condition and handle it accordingly.
2209 * This is just a fast path, alternative to call processInputBuffer().
2210 * It's a good idea since the code is small and this condition
2211 * happens most of the times. */
2212 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2213 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2214 c->argc++;
2215 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2216 } else {
2217 /* Otherwise return... there is to read the last argument
2218 * from the socket. */
2219 return 1;
2220 }
2221 }
2222 /* Let's try to share objects on the command arguments vector */
2223 if (server.shareobjects) {
2224 int j;
2225 for(j = 1; j < c->argc; j++)
2226 c->argv[j] = tryObjectSharing(c->argv[j]);
2227 }
2228 /* Let's try to encode the bulk object to save space. */
2229 if (cmd->flags & REDIS_CMD_BULK)
2230 tryObjectEncoding(c->argv[c->argc-1]);
2231
2232 /* Check if the user is authenticated */
2233 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2234 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2235 resetClient(c);
2236 return 1;
2237 }
2238
2239 /* Exec the command */
2240 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2241 queueMultiCommand(c,cmd);
2242 addReply(c,shared.queued);
2243 } else {
2244 if (server.vm_enabled && server.vm_max_threads > 0 &&
2245 blockClientOnSwappedKeys(cmd,c)) return 1;
2246 call(c,cmd);
2247 }
2248
2249 /* Prepare the client for the next command */
2250 resetClient(c);
2251 return 1;
2252 }
2253
2254 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2255 listNode *ln;
2256 listIter li;
2257 int outc = 0, j;
2258 robj **outv;
2259 /* (args*2)+1 is enough room for args, spaces, newlines */
2260 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2261
2262 if (argc <= REDIS_STATIC_ARGS) {
2263 outv = static_outv;
2264 } else {
2265 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2266 }
2267
2268 for (j = 0; j < argc; j++) {
2269 if (j != 0) outv[outc++] = shared.space;
2270 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2271 robj *lenobj;
2272
2273 lenobj = createObject(REDIS_STRING,
2274 sdscatprintf(sdsempty(),"%lu\r\n",
2275 (unsigned long) stringObjectLen(argv[j])));
2276 lenobj->refcount = 0;
2277 outv[outc++] = lenobj;
2278 }
2279 outv[outc++] = argv[j];
2280 }
2281 outv[outc++] = shared.crlf;
2282
2283 /* Increment all the refcounts at start and decrement at end in order to
2284 * be sure to free objects if there is no slave in a replication state
2285 * able to be feed with commands */
2286 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2287 listRewind(slaves,&li);
2288 while((ln = listNext(&li))) {
2289 redisClient *slave = ln->value;
2290
2291 /* Don't feed slaves that are still waiting for BGSAVE to start */
2292 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2293
2294 /* Feed all the other slaves, MONITORs and so on */
2295 if (slave->slaveseldb != dictid) {
2296 robj *selectcmd;
2297
2298 switch(dictid) {
2299 case 0: selectcmd = shared.select0; break;
2300 case 1: selectcmd = shared.select1; break;
2301 case 2: selectcmd = shared.select2; break;
2302 case 3: selectcmd = shared.select3; break;
2303 case 4: selectcmd = shared.select4; break;
2304 case 5: selectcmd = shared.select5; break;
2305 case 6: selectcmd = shared.select6; break;
2306 case 7: selectcmd = shared.select7; break;
2307 case 8: selectcmd = shared.select8; break;
2308 case 9: selectcmd = shared.select9; break;
2309 default:
2310 selectcmd = createObject(REDIS_STRING,
2311 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2312 selectcmd->refcount = 0;
2313 break;
2314 }
2315 addReply(slave,selectcmd);
2316 slave->slaveseldb = dictid;
2317 }
2318 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2319 }
2320 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2321 if (outv != static_outv) zfree(outv);
2322 }
2323
2324 static void processInputBuffer(redisClient *c) {
2325 again:
2326 /* Before to process the input buffer, make sure the client is not
2327 * waitig for a blocking operation such as BLPOP. Note that the first
2328 * iteration the client is never blocked, otherwise the processInputBuffer
2329 * would not be called at all, but after the execution of the first commands
2330 * in the input buffer the client may be blocked, and the "goto again"
2331 * will try to reiterate. The following line will make it return asap. */
2332 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2333 if (c->bulklen == -1) {
2334 /* Read the first line of the query */
2335 char *p = strchr(c->querybuf,'\n');
2336 size_t querylen;
2337
2338 if (p) {
2339 sds query, *argv;
2340 int argc, j;
2341
2342 query = c->querybuf;
2343 c->querybuf = sdsempty();
2344 querylen = 1+(p-(query));
2345 if (sdslen(query) > querylen) {
2346 /* leave data after the first line of the query in the buffer */
2347 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2348 }
2349 *p = '\0'; /* remove "\n" */
2350 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2351 sdsupdatelen(query);
2352
2353 /* Now we can split the query in arguments */
2354 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2355 sdsfree(query);
2356
2357 if (c->argv) zfree(c->argv);
2358 c->argv = zmalloc(sizeof(robj*)*argc);
2359
2360 for (j = 0; j < argc; j++) {
2361 if (sdslen(argv[j])) {
2362 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2363 c->argc++;
2364 } else {
2365 sdsfree(argv[j]);
2366 }
2367 }
2368 zfree(argv);
2369 if (c->argc) {
2370 /* Execute the command. If the client is still valid
2371 * after processCommand() return and there is something
2372 * on the query buffer try to process the next command. */
2373 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2374 } else {
2375 /* Nothing to process, argc == 0. Just process the query
2376 * buffer if it's not empty or return to the caller */
2377 if (sdslen(c->querybuf)) goto again;
2378 }
2379 return;
2380 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2381 redisLog(REDIS_VERBOSE, "Client protocol error");
2382 freeClient(c);
2383 return;
2384 }
2385 } else {
2386 /* Bulk read handling. Note that if we are at this point
2387 the client already sent a command terminated with a newline,
2388 we are reading the bulk data that is actually the last
2389 argument of the command. */
2390 int qbl = sdslen(c->querybuf);
2391
2392 if (c->bulklen <= qbl) {
2393 /* Copy everything but the final CRLF as final argument */
2394 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2395 c->argc++;
2396 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2397 /* Process the command. If the client is still valid after
2398 * the processing and there is more data in the buffer
2399 * try to parse it. */
2400 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2401 return;
2402 }
2403 }
2404 }
2405
2406 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2407 redisClient *c = (redisClient*) privdata;
2408 char buf[REDIS_IOBUF_LEN];
2409 int nread;
2410 REDIS_NOTUSED(el);
2411 REDIS_NOTUSED(mask);
2412
2413 nread = read(fd, buf, REDIS_IOBUF_LEN);
2414 if (nread == -1) {
2415 if (errno == EAGAIN) {
2416 nread = 0;
2417 } else {
2418 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2419 freeClient(c);
2420 return;
2421 }
2422 } else if (nread == 0) {
2423 redisLog(REDIS_VERBOSE, "Client closed connection");
2424 freeClient(c);
2425 return;
2426 }
2427 if (nread) {
2428 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2429 c->lastinteraction = time(NULL);
2430 } else {
2431 return;
2432 }
2433 if (!(c->flags & REDIS_BLOCKED))
2434 processInputBuffer(c);
2435 }
2436
2437 static int selectDb(redisClient *c, int id) {
2438 if (id < 0 || id >= server.dbnum)
2439 return REDIS_ERR;
2440 c->db = &server.db[id];
2441 return REDIS_OK;
2442 }
2443
2444 static void *dupClientReplyValue(void *o) {
2445 incrRefCount((robj*)o);
2446 return o;
2447 }
2448
2449 static redisClient *createClient(int fd) {
2450 redisClient *c = zmalloc(sizeof(*c));
2451
2452 anetNonBlock(NULL,fd);
2453 anetTcpNoDelay(NULL,fd);
2454 if (!c) return NULL;
2455 selectDb(c,0);
2456 c->fd = fd;
2457 c->querybuf = sdsempty();
2458 c->argc = 0;
2459 c->argv = NULL;
2460 c->bulklen = -1;
2461 c->multibulk = 0;
2462 c->mbargc = 0;
2463 c->mbargv = NULL;
2464 c->sentlen = 0;
2465 c->flags = 0;
2466 c->lastinteraction = time(NULL);
2467 c->authenticated = 0;
2468 c->replstate = REDIS_REPL_NONE;
2469 c->reply = listCreate();
2470 listSetFreeMethod(c->reply,decrRefCount);
2471 listSetDupMethod(c->reply,dupClientReplyValue);
2472 c->blockingkeys = NULL;
2473 c->blockingkeysnum = 0;
2474 c->io_keys = listCreate();
2475 listSetFreeMethod(c->io_keys,decrRefCount);
2476 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2477 readQueryFromClient, c) == AE_ERR) {
2478 freeClient(c);
2479 return NULL;
2480 }
2481 listAddNodeTail(server.clients,c);
2482 initClientMultiState(c);
2483 return c;
2484 }
2485
2486 static void addReply(redisClient *c, robj *obj) {
2487 if (listLength(c->reply) == 0 &&
2488 (c->replstate == REDIS_REPL_NONE ||
2489 c->replstate == REDIS_REPL_ONLINE) &&
2490 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2491 sendReplyToClient, c) == AE_ERR) return;
2492
2493 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2494 obj = dupStringObject(obj);
2495 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2496 }
2497 listAddNodeTail(c->reply,getDecodedObject(obj));
2498 }
2499
2500 static void addReplySds(redisClient *c, sds s) {
2501 robj *o = createObject(REDIS_STRING,s);
2502 addReply(c,o);
2503 decrRefCount(o);
2504 }
2505
2506 static void addReplyDouble(redisClient *c, double d) {
2507 char buf[128];
2508
2509 snprintf(buf,sizeof(buf),"%.17g",d);
2510 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2511 (unsigned long) strlen(buf),buf));
2512 }
2513
2514 static void addReplyLong(redisClient *c, long l) {
2515 char buf[128];
2516 size_t len;
2517
2518 if (l == 0) {
2519 addReply(c,shared.czero);
2520 return;
2521 } else if (l == 1) {
2522 addReply(c,shared.cone);
2523 return;
2524 }
2525 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2526 addReplySds(c,sdsnewlen(buf,len));
2527 }
2528
2529 static void addReplyUlong(redisClient *c, unsigned long ul) {
2530 char buf[128];
2531 size_t len;
2532
2533 if (ul == 0) {
2534 addReply(c,shared.czero);
2535 return;
2536 } else if (ul == 1) {
2537 addReply(c,shared.cone);
2538 return;
2539 }
2540 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2541 addReplySds(c,sdsnewlen(buf,len));
2542 }
2543
2544 static void addReplyBulkLen(redisClient *c, robj *obj) {
2545 size_t len;
2546
2547 if (obj->encoding == REDIS_ENCODING_RAW) {
2548 len = sdslen(obj->ptr);
2549 } else {
2550 long n = (long)obj->ptr;
2551
2552 /* Compute how many bytes will take this integer as a radix 10 string */
2553 len = 1;
2554 if (n < 0) {
2555 len++;
2556 n = -n;
2557 }
2558 while((n = n/10) != 0) {
2559 len++;
2560 }
2561 }
2562 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2563 }
2564
2565 static void addReplyBulk(redisClient *c, robj *obj) {
2566 addReplyBulkLen(c,obj);
2567 addReply(c,obj);
2568 addReply(c,shared.crlf);
2569 }
2570
2571 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2572 static void addReplyBulkCString(redisClient *c, char *s) {
2573 if (s == NULL) {
2574 addReply(c,shared.nullbulk);
2575 } else {
2576 robj *o = createStringObject(s,strlen(s));
2577 addReplyBulk(c,o);
2578 decrRefCount(o);
2579 }
2580 }
2581
2582 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2583 int cport, cfd;
2584 char cip[128];
2585 redisClient *c;
2586 REDIS_NOTUSED(el);
2587 REDIS_NOTUSED(mask);
2588 REDIS_NOTUSED(privdata);
2589
2590 cfd = anetAccept(server.neterr, fd, cip, &cport);
2591 if (cfd == AE_ERR) {
2592 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2593 return;
2594 }
2595 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2596 if ((c = createClient(cfd)) == NULL) {
2597 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2598 close(cfd); /* May be already closed, just ingore errors */
2599 return;
2600 }
2601 /* If maxclient directive is set and this is one client more... close the
2602 * connection. Note that we create the client instead to check before
2603 * for this condition, since now the socket is already set in nonblocking
2604 * mode and we can send an error for free using the Kernel I/O */
2605 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2606 char *err = "-ERR max number of clients reached\r\n";
2607
2608 /* That's a best effort error message, don't check write errors */
2609 if (write(c->fd,err,strlen(err)) == -1) {
2610 /* Nothing to do, Just to avoid the warning... */
2611 }
2612 freeClient(c);
2613 return;
2614 }
2615 server.stat_numconnections++;
2616 }
2617
2618 /* ======================= Redis objects implementation ===================== */
2619
2620 static robj *createObject(int type, void *ptr) {
2621 robj *o;
2622
2623 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2624 if (listLength(server.objfreelist)) {
2625 listNode *head = listFirst(server.objfreelist);
2626 o = listNodeValue(head);
2627 listDelNode(server.objfreelist,head);
2628 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2629 } else {
2630 if (server.vm_enabled) {
2631 pthread_mutex_unlock(&server.obj_freelist_mutex);
2632 o = zmalloc(sizeof(*o));
2633 } else {
2634 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2635 }
2636 }
2637 o->type = type;
2638 o->encoding = REDIS_ENCODING_RAW;
2639 o->ptr = ptr;
2640 o->refcount = 1;
2641 if (server.vm_enabled) {
2642 /* Note that this code may run in the context of an I/O thread
2643 * and accessing to server.unixtime in theory is an error
2644 * (no locks). But in practice this is safe, and even if we read
2645 * garbage Redis will not fail, as it's just a statistical info */
2646 o->vm.atime = server.unixtime;
2647 o->storage = REDIS_VM_MEMORY;
2648 }
2649 return o;
2650 }
2651
2652 static robj *createStringObject(char *ptr, size_t len) {
2653 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2654 }
2655
2656 static robj *dupStringObject(robj *o) {
2657 assert(o->encoding == REDIS_ENCODING_RAW);
2658 return createStringObject(o->ptr,sdslen(o->ptr));
2659 }
2660
2661 static robj *createListObject(void) {
2662 list *l = listCreate();
2663
2664 listSetFreeMethod(l,decrRefCount);
2665 return createObject(REDIS_LIST,l);
2666 }
2667
2668 static robj *createSetObject(void) {
2669 dict *d = dictCreate(&setDictType,NULL);
2670 return createObject(REDIS_SET,d);
2671 }
2672
2673 static robj *createHashObject(void) {
2674 /* All the Hashes start as zipmaps. Will be automatically converted
2675 * into hash tables if there are enough elements or big elements
2676 * inside. */
2677 unsigned char *zm = zipmapNew();
2678 robj *o = createObject(REDIS_HASH,zm);
2679 o->encoding = REDIS_ENCODING_ZIPMAP;
2680 return o;
2681 }
2682
2683 static robj *createZsetObject(void) {
2684 zset *zs = zmalloc(sizeof(*zs));
2685
2686 zs->dict = dictCreate(&zsetDictType,NULL);
2687 zs->zsl = zslCreate();
2688 return createObject(REDIS_ZSET,zs);
2689 }
2690
2691 static void freeStringObject(robj *o) {
2692 if (o->encoding == REDIS_ENCODING_RAW) {
2693 sdsfree(o->ptr);
2694 }
2695 }
2696
2697 static void freeListObject(robj *o) {
2698 listRelease((list*) o->ptr);
2699 }
2700
2701 static void freeSetObject(robj *o) {
2702 dictRelease((dict*) o->ptr);
2703 }
2704
2705 static void freeZsetObject(robj *o) {
2706 zset *zs = o->ptr;
2707
2708 dictRelease(zs->dict);
2709 zslFree(zs->zsl);
2710 zfree(zs);
2711 }
2712
2713 static void freeHashObject(robj *o) {
2714 switch (o->encoding) {
2715 case REDIS_ENCODING_HT:
2716 dictRelease((dict*) o->ptr);
2717 break;
2718 case REDIS_ENCODING_ZIPMAP:
2719 zfree(o->ptr);
2720 break;
2721 default:
2722 redisAssert(0);
2723 break;
2724 }
2725 }
2726
2727 static void incrRefCount(robj *o) {
2728 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2729 o->refcount++;
2730 }
2731
2732 static void decrRefCount(void *obj) {
2733 robj *o = obj;
2734
2735 /* Object is a key of a swapped out value, or in the process of being
2736 * loaded. */
2737 if (server.vm_enabled &&
2738 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2739 {
2740 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2741 redisAssert(o->refcount == 1);
2742 }
2743 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2744 redisAssert(o->type == REDIS_STRING);
2745 freeStringObject(o);
2746 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2747 pthread_mutex_lock(&server.obj_freelist_mutex);
2748 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2749 !listAddNodeHead(server.objfreelist,o))
2750 zfree(o);
2751 pthread_mutex_unlock(&server.obj_freelist_mutex);
2752 server.vm_stats_swapped_objects--;
2753 return;
2754 }
2755 /* Object is in memory, or in the process of being swapped out. */
2756 if (--(o->refcount) == 0) {
2757 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2758 vmCancelThreadedIOJob(obj);
2759 switch(o->type) {
2760 case REDIS_STRING: freeStringObject(o); break;
2761 case REDIS_LIST: freeListObject(o); break;
2762 case REDIS_SET: freeSetObject(o); break;
2763 case REDIS_ZSET: freeZsetObject(o); break;
2764 case REDIS_HASH: freeHashObject(o); break;
2765 default: redisAssert(0); break;
2766 }
2767 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2768 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2769 !listAddNodeHead(server.objfreelist,o))
2770 zfree(o);
2771 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2772 }
2773 }
2774
2775 static robj *lookupKey(redisDb *db, robj *key) {
2776 dictEntry *de = dictFind(db->dict,key);
2777 if (de) {
2778 robj *key = dictGetEntryKey(de);
2779 robj *val = dictGetEntryVal(de);
2780
2781 if (server.vm_enabled) {
2782 if (key->storage == REDIS_VM_MEMORY ||
2783 key->storage == REDIS_VM_SWAPPING)
2784 {
2785 /* If we were swapping the object out, stop it, this key
2786 * was requested. */
2787 if (key->storage == REDIS_VM_SWAPPING)
2788 vmCancelThreadedIOJob(key);
2789 /* Update the access time of the key for the aging algorithm. */
2790 key->vm.atime = server.unixtime;
2791 } else {
2792 int notify = (key->storage == REDIS_VM_LOADING);
2793
2794 /* Our value was swapped on disk. Bring it at home. */
2795 redisAssert(val == NULL);
2796 val = vmLoadObject(key);
2797 dictGetEntryVal(de) = val;
2798
2799 /* Clients blocked by the VM subsystem may be waiting for
2800 * this key... */
2801 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2802 }
2803 }
2804 return val;
2805 } else {
2806 return NULL;
2807 }
2808 }
2809
2810 static robj *lookupKeyRead(redisDb *db, robj *key) {
2811 expireIfNeeded(db,key);
2812 return lookupKey(db,key);
2813 }
2814
2815 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2816 deleteIfVolatile(db,key);
2817 return lookupKey(db,key);
2818 }
2819
2820 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2821 robj *o = lookupKeyRead(c->db, key);
2822 if (!o) addReply(c,reply);
2823 return o;
2824 }
2825
2826 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2827 robj *o = lookupKeyWrite(c->db, key);
2828 if (!o) addReply(c,reply);
2829 return o;
2830 }
2831
2832 static int checkType(redisClient *c, robj *o, int type) {
2833 if (o->type != type) {
2834 addReply(c,shared.wrongtypeerr);
2835 return 1;
2836 }
2837 return 0;
2838 }
2839
2840 static int deleteKey(redisDb *db, robj *key) {
2841 int retval;
2842
2843 /* We need to protect key from destruction: after the first dictDelete()
2844 * it may happen that 'key' is no longer valid if we don't increment
2845 * it's count. This may happen when we get the object reference directly
2846 * from the hash table with dictRandomKey() or dict iterators */
2847 incrRefCount(key);
2848 if (dictSize(db->expires)) dictDelete(db->expires,key);
2849 retval = dictDelete(db->dict,key);
2850 decrRefCount(key);
2851
2852 return retval == DICT_OK;
2853 }
2854
2855 /* Try to share an object against the shared objects pool */
2856 static robj *tryObjectSharing(robj *o) {
2857 struct dictEntry *de;
2858 unsigned long c;
2859
2860 if (o == NULL || server.shareobjects == 0) return o;
2861
2862 redisAssert(o->type == REDIS_STRING);
2863 de = dictFind(server.sharingpool,o);
2864 if (de) {
2865 robj *shared = dictGetEntryKey(de);
2866
2867 c = ((unsigned long) dictGetEntryVal(de))+1;
2868 dictGetEntryVal(de) = (void*) c;
2869 incrRefCount(shared);
2870 decrRefCount(o);
2871 return shared;
2872 } else {
2873 /* Here we are using a stream algorihtm: Every time an object is
2874 * shared we increment its count, everytime there is a miss we
2875 * recrement the counter of a random object. If this object reaches
2876 * zero we remove the object and put the current object instead. */
2877 if (dictSize(server.sharingpool) >=
2878 server.sharingpoolsize) {
2879 de = dictGetRandomKey(server.sharingpool);
2880 redisAssert(de != NULL);
2881 c = ((unsigned long) dictGetEntryVal(de))-1;
2882 dictGetEntryVal(de) = (void*) c;
2883 if (c == 0) {
2884 dictDelete(server.sharingpool,de->key);
2885 }
2886 } else {
2887 c = 0; /* If the pool is empty we want to add this object */
2888 }
2889 if (c == 0) {
2890 int retval;
2891
2892 retval = dictAdd(server.sharingpool,o,(void*)1);
2893 redisAssert(retval == DICT_OK);
2894 incrRefCount(o);
2895 }
2896 return o;
2897 }
2898 }
2899
2900 /* Check if the nul-terminated string 's' can be represented by a long
2901 * (that is, is a number that fits into long without any other space or
2902 * character before or after the digits).
2903 *
2904 * If so, the function returns REDIS_OK and *longval is set to the value
2905 * of the number. Otherwise REDIS_ERR is returned */
2906 static int isStringRepresentableAsLong(sds s, long *longval) {
2907 char buf[32], *endptr;
2908 long value;
2909 int slen;
2910
2911 value = strtol(s, &endptr, 10);
2912 if (endptr[0] != '\0') return REDIS_ERR;
2913 slen = snprintf(buf,32,"%ld",value);
2914
2915 /* If the number converted back into a string is not identical
2916 * then it's not possible to encode the string as integer */
2917 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2918 if (longval) *longval = value;
2919 return REDIS_OK;
2920 }
2921
2922 /* Try to encode a string object in order to save space */
2923 static int tryObjectEncoding(robj *o) {
2924 long value;
2925 sds s = o->ptr;
2926
2927 if (o->encoding != REDIS_ENCODING_RAW)
2928 return REDIS_ERR; /* Already encoded */
2929
2930 /* It's not save to encode shared objects: shared objects can be shared
2931 * everywhere in the "object space" of Redis. Encoded objects can only
2932 * appear as "values" (and not, for instance, as keys) */
2933 if (o->refcount > 1) return REDIS_ERR;
2934
2935 /* Currently we try to encode only strings */
2936 redisAssert(o->type == REDIS_STRING);
2937
2938 /* Check if we can represent this string as a long integer */
2939 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2940
2941 /* Ok, this object can be encoded */
2942 o->encoding = REDIS_ENCODING_INT;
2943 sdsfree(o->ptr);
2944 o->ptr = (void*) value;
2945 return REDIS_OK;
2946 }
2947
2948 /* Get a decoded version of an encoded object (returned as a new object).
2949 * If the object is already raw-encoded just increment the ref count. */
2950 static robj *getDecodedObject(robj *o) {
2951 robj *dec;
2952
2953 if (o->encoding == REDIS_ENCODING_RAW) {
2954 incrRefCount(o);
2955 return o;
2956 }
2957 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2958 char buf[32];
2959
2960 snprintf(buf,32,"%ld",(long)o->ptr);
2961 dec = createStringObject(buf,strlen(buf));
2962 return dec;
2963 } else {
2964 redisAssert(1 != 1);
2965 }
2966 }
2967
2968 /* Compare two string objects via strcmp() or alike.
2969 * Note that the objects may be integer-encoded. In such a case we
2970 * use snprintf() to get a string representation of the numbers on the stack
2971 * and compare the strings, it's much faster than calling getDecodedObject().
2972 *
2973 * Important note: if objects are not integer encoded, but binary-safe strings,
2974 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2975 * binary safe. */
2976 static int compareStringObjects(robj *a, robj *b) {
2977 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2978 char bufa[128], bufb[128], *astr, *bstr;
2979 int bothsds = 1;
2980
2981 if (a == b) return 0;
2982 if (a->encoding != REDIS_ENCODING_RAW) {
2983 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2984 astr = bufa;
2985 bothsds = 0;
2986 } else {
2987 astr = a->ptr;
2988 }
2989 if (b->encoding != REDIS_ENCODING_RAW) {
2990 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2991 bstr = bufb;
2992 bothsds = 0;
2993 } else {
2994 bstr = b->ptr;
2995 }
2996 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2997 }
2998
2999 static size_t stringObjectLen(robj *o) {
3000 redisAssert(o->type == REDIS_STRING);
3001 if (o->encoding == REDIS_ENCODING_RAW) {
3002 return sdslen(o->ptr);
3003 } else {
3004 char buf[32];
3005
3006 return snprintf(buf,32,"%ld",(long)o->ptr);
3007 }
3008 }
3009
3010 /*============================ RDB saving/loading =========================== */
3011
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3016
/* Write the time 't' to 'fp' as a 4 byte int32_t in the byte order of the
 * host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3022
3023 /* check rdbLoadLen() comments for more info */
3024 static int rdbSaveLen(FILE *fp, uint32_t len) {
3025 unsigned char buf[2];
3026
3027 if (len < (1<<6)) {
3028 /* Save a 6 bit len */
3029 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3030 if (fwrite(buf,1,1,fp) == 0) return -1;
3031 } else if (len < (1<<14)) {
3032 /* Save a 14 bit len */
3033 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3034 buf[1] = len&0xFF;
3035 if (fwrite(buf,2,1,fp) == 0) return -1;
3036 } else {
3037 /* Save a 32 bit len */
3038 buf[0] = (REDIS_RDB_32BITLEN<<6);
3039 if (fwrite(buf,1,1,fp) == 0) return -1;
3040 len = htonl(len);
3041 if (fwrite(&len,4,1,fp) == 0) return -1;
3042 }
3043 return 0;
3044 }
3045
3046 /* String objects in the form "2391" "-100" without any space and with a
3047 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3048 * encoded as integers to save space */
3049 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3050 long long value;
3051 char *endptr, buf[32];
3052
3053 /* Check if it's possible to encode this value as a number */
3054 value = strtoll(s, &endptr, 10);
3055 if (endptr[0] != '\0') return 0;
3056 snprintf(buf,32,"%lld",value);
3057
3058 /* If the number converted back into a string is not identical
3059 * then it's not possible to encode the string as integer */
3060 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3061
3062 /* Finally check if it fits in our ranges */
3063 if (value >= -(1<<7) && value <= (1<<7)-1) {
3064 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3065 enc[1] = value&0xFF;
3066 return 2;
3067 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3068 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3069 enc[1] = value&0xFF;
3070 enc[2] = (value>>8)&0xFF;
3071 return 3;
3072 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3073 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3074 enc[1] = value&0xFF;
3075 enc[2] = (value>>8)&0xFF;
3076 enc[3] = (value>>16)&0xFF;
3077 enc[4] = (value>>24)&0xFF;
3078 return 5;
3079 } else {
3080 return 0;
3081 }
3082 }
3083
3084 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3085 size_t comprlen, outlen;
3086 unsigned char byte;
3087 void *out;
3088
3089 /* We require at least four bytes compression for this to be worth it */
3090 if (len <= 4) return 0;
3091 outlen = len-4;
3092 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3093 comprlen = lzf_compress(s, len, out, outlen);
3094 if (comprlen == 0) {
3095 zfree(out);
3096 return 0;
3097 }
3098 /* Data compressed! Let's save it on disk */
3099 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3100 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3101 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3102 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3103 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3104 zfree(out);
3105 return comprlen;
3106
3107 writeerr:
3108 zfree(out);
3109 return -1;
3110 }
3111
3112 /* Save a string objet as [len][data] on disk. If the object is a string
3113 * representation of an integer value we try to safe it in a special form */
3114 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3115 int enclen;
3116
3117 /* Try integer encoding */
3118 if (len <= 11) {
3119 unsigned char buf[5];
3120 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3121 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3122 return 0;
3123 }
3124 }
3125
3126 /* Try LZF compression - under 20 bytes it's unable to compress even
3127 * aaaaaaaaaaaaaaaaaa so skip it */
3128 if (server.rdbcompression && len > 20) {
3129 int retval;
3130
3131 retval = rdbSaveLzfStringObject(fp,s,len);
3132 if (retval == -1) return -1;
3133 if (retval > 0) return 0;
3134 /* retval == 0 means data can't be compressed, save the old way */
3135 }
3136
3137 /* Store verbatim */
3138 if (rdbSaveLen(fp,len) == -1) return -1;
3139 if (len && fwrite(s,len,1,fp) == 0) return -1;
3140 return 0;
3141 }
3142
3143 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3144 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3145 int retval;
3146
3147 /* Avoid incr/decr ref count business when possible.
3148 * This plays well with copy-on-write given that we are probably
3149 * in a child process (BGSAVE). Also this makes sure key objects
3150 * of swapped objects are not incRefCount-ed (an assert does not allow
3151 * this in order to avoid bugs) */
3152 if (obj->encoding != REDIS_ENCODING_RAW) {
3153 obj = getDecodedObject(obj);
3154 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3155 decrRefCount(obj);
3156 } else {
3157 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3158 }
3159 return retval;
3160 }
3161
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values, not followed by
 * any string, in order to specify the following conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;    /* Special values take just the prefix byte. */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3188
3189 /* Save a Redis object. */
3190 static int rdbSaveObject(FILE *fp, robj *o) {
3191 if (o->type == REDIS_STRING) {
3192 /* Save a string value */
3193 if (rdbSaveStringObject(fp,o) == -1) return -1;
3194 } else if (o->type == REDIS_LIST) {
3195 /* Save a list value */
3196 list *list = o->ptr;
3197 listIter li;
3198 listNode *ln;
3199
3200 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3201 listRewind(list,&li);
3202 while((ln = listNext(&li))) {
3203 robj *eleobj = listNodeValue(ln);
3204
3205 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3206 }
3207 } else if (o->type == REDIS_SET) {
3208 /* Save a set value */
3209 dict *set = o->ptr;
3210 dictIterator *di = dictGetIterator(set);
3211 dictEntry *de;
3212
3213 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3214 while((de = dictNext(di)) != NULL) {
3215 robj *eleobj = dictGetEntryKey(de);
3216
3217 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3218 }
3219 dictReleaseIterator(di);
3220 } else if (o->type == REDIS_ZSET) {
3221 /* Save a set value */
3222 zset *zs = o->ptr;
3223 dictIterator *di = dictGetIterator(zs->dict);
3224 dictEntry *de;
3225
3226 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3227 while((de = dictNext(di)) != NULL) {
3228 robj *eleobj = dictGetEntryKey(de);
3229 double *score = dictGetEntryVal(de);
3230
3231 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3232 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3233 }
3234 dictReleaseIterator(di);
3235 } else if (o->type == REDIS_HASH) {
3236 /* Save a hash value */
3237 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3238 unsigned char *p = zipmapRewind(o->ptr);
3239 unsigned int count = zipmapLen(o->ptr);
3240 unsigned char *key, *val;
3241 unsigned int klen, vlen;
3242
3243 if (rdbSaveLen(fp,count) == -1) return -1;
3244 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3245 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3246 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3247 }
3248 } else {
3249 dictIterator *di = dictGetIterator(o->ptr);
3250 dictEntry *de;
3251
3252 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3253 while((de = dictNext(di)) != NULL) {
3254 robj *key = dictGetEntryKey(de);
3255 robj *val = dictGetEntryVal(de);
3256
3257 if (rdbSaveStringObject(fp,key) == -1) return -1;
3258 if (rdbSaveStringObject(fp,val) == -1) return -1;
3259 }
3260 dictReleaseIterator(di);
3261 }
3262 } else {
3263 redisAssert(0);
3264 }
3265 return 0;
3266 }
3267
3268 /* Return the length the object will have on disk if saved with
3269 * the rdbSaveObject() function. Currently we use a trick to get
3270 * this length with very little changes to the code. In the future
3271 * we could switch to a faster solution. */
3272 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3273 if (fp == NULL) fp = server.devnull;
3274 rewind(fp);
3275 assert(rdbSaveObject(fp,o) != 1);
3276 return ftello(fp);
3277 }
3278
3279 /* Return the number of pages required to save this object in the swap file */
3280 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3281 off_t bytes = rdbSavedObjectLen(o,fp);
3282
3283 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3284 }
3285
3286 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3287 static int rdbSave(char *filename) {
3288 dictIterator *di = NULL;
3289 dictEntry *de;
3290 FILE *fp;
3291 char tmpfile[256];
3292 int j;
3293 time_t now = time(NULL);
3294
3295 /* Wait for I/O therads to terminate, just in case this is a
3296 * foreground-saving, to avoid seeking the swap file descriptor at the
3297 * same time. */
3298 if (server.vm_enabled)
3299 waitEmptyIOJobsQueue();
3300
3301 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3302 fp = fopen(tmpfile,"w");
3303 if (!fp) {
3304 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3305 return REDIS_ERR;
3306 }
3307 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3308 for (j = 0; j < server.dbnum; j++) {
3309 redisDb *db = server.db+j;
3310 dict *d = db->dict;
3311 if (dictSize(d) == 0) continue;
3312 di = dictGetIterator(d);
3313 if (!di) {
3314 fclose(fp);
3315 return REDIS_ERR;
3316 }
3317
3318 /* Write the SELECT DB opcode */
3319 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3320 if (rdbSaveLen(fp,j) == -1) goto werr;
3321
3322 /* Iterate this DB writing every entry */
3323 while((de = dictNext(di)) != NULL) {
3324 robj *key = dictGetEntryKey(de);
3325 robj *o = dictGetEntryVal(de);
3326 time_t expiretime = getExpire(db,key);
3327
3328 /* Save the expire time */
3329 if (expiretime != -1) {
3330 /* If this key is already expired skip it */
3331 if (expiretime < now) continue;
3332 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3333 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3334 }
3335 /* Save the key and associated value. This requires special
3336 * handling if the value is swapped out. */
3337 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3338 key->storage == REDIS_VM_SWAPPING) {
3339 /* Save type, key, value */
3340 if (rdbSaveType(fp,o->type) == -1) goto werr;
3341 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3342 if (rdbSaveObject(fp,o) == -1) goto werr;
3343 } else {
3344 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3345 robj *po;
3346 /* Get a preview of the object in memory */
3347 po = vmPreviewObject(key);
3348 /* Save type, key, value */
3349 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3350 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3351 if (rdbSaveObject(fp,po) == -1) goto werr;
3352 /* Remove the loaded object from memory */
3353 decrRefCount(po);
3354 }
3355 }
3356 dictReleaseIterator(di);
3357 }
3358 /* EOF opcode */
3359 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3360
3361 /* Make sure data will not remain on the OS's output buffers */
3362 fflush(fp);
3363 fsync(fileno(fp));
3364 fclose(fp);
3365
3366 /* Use RENAME to make sure the DB file is changed atomically only
3367 * if the generate DB file is ok. */
3368 if (rename(tmpfile,filename) == -1) {
3369 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3370 unlink(tmpfile);
3371 return REDIS_ERR;
3372 }
3373 redisLog(REDIS_NOTICE,"DB saved on disk");
3374 server.dirty = 0;
3375 server.lastsave = time(NULL);
3376 return REDIS_OK;
3377
3378 werr:
3379 fclose(fp);
3380 unlink(tmpfile);
3381 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3382 if (di) dictReleaseIterator(di);
3383 return REDIS_ERR;
3384 }
3385
3386 static int rdbSaveBackground(char *filename) {
3387 pid_t childpid;
3388
3389 if (server.bgsavechildpid != -1) return REDIS_ERR;
3390 if (server.vm_enabled) waitEmptyIOJobsQueue();
3391 if ((childpid = fork()) == 0) {
3392 /* Child */
3393 if (server.vm_enabled) vmReopenSwapFile();
3394 close(server.fd);
3395 if (rdbSave(filename) == REDIS_OK) {
3396 _exit(0);
3397 } else {
3398 _exit(1);
3399 }
3400 } else {
3401 /* Parent */
3402 if (childpid == -1) {
3403 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3404 strerror(errno));
3405 return REDIS_ERR;
3406 }
3407 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3408 server.bgsavechildpid = childpid;
3409 return REDIS_OK;
3410 }
3411 return REDIS_OK; /* unreached */
3412 }
3413
/* Remove the temp file "temp-<childpid>.rdb" from the current working
 * directory. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3420
/* Read a single byte type from 'fp'. Returns the byte as a non negative
 * value, or -1 if it can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3426
/* Read a time saved as a 4 byte int32_t in the byte order of the host.
 * Returns the time, or -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;
    size_t items = fread(&stamp,4,1,fp);

    if (items == 0) return -1;
    return (time_t) stamp;
}
3432
3433 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3434 * of this file for a description of how this are stored on disk.
3435 *
3436 * isencoded is set to 1 if the readed length is not actually a length but
3437 * an "encoding type", check the above comments for more info */
3438 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3439 unsigned char buf[2];
3440 uint32_t len;
3441 int type;
3442
3443 if (isencoded) *isencoded = 0;
3444 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3445 type = (buf[0]&0xC0)>>6;
3446 if (type == REDIS_RDB_6BITLEN) {
3447 /* Read a 6 bit len */
3448 return buf[0]&0x3F;
3449 } else if (type == REDIS_RDB_ENCVAL) {
3450 /* Read a 6 bit len encoding type */
3451 if (isencoded) *isencoded = 1;
3452 return buf[0]&0x3F;
3453 } else if (type == REDIS_RDB_14BITLEN) {
3454 /* Read a 14 bit len */
3455 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3456 return ((buf[0]&0x3F)<<8)|buf[1];
3457 } else {
3458 /* Read a 32 bit len */
3459 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3460 return ntohl(len);
3461 }
3462 }
3463
3464 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3465 unsigned char enc[4];
3466 long long val;
3467
3468 if (enctype == REDIS_RDB_ENC_INT8) {
3469 if (fread(enc,1,1,fp) == 0) return NULL;
3470 val = (signed char)enc[0];
3471 } else if (enctype == REDIS_RDB_ENC_INT16) {
3472 uint16_t v;
3473 if (fread(enc,2,1,fp) == 0) return NULL;
3474 v = enc[0]|(enc[1]<<8);
3475 val = (int16_t)v;
3476 } else if (enctype == REDIS_RDB_ENC_INT32) {
3477 uint32_t v;
3478 if (fread(enc,4,1,fp) == 0) return NULL;
3479 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3480 val = (int32_t)v;
3481 } else {
3482 val = 0; /* anti-warning */
3483 redisAssert(0);
3484 }
3485 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3486 }
3487
3488 static robj *rdbLoadLzfStringObject(FILE*fp) {
3489 unsigned int len, clen;
3490 unsigned char *c = NULL;
3491 sds val = NULL;
3492
3493 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3494 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3495 if ((c = zmalloc(clen)) == NULL) goto err;
3496 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3497 if (fread(c,clen,1,fp) == 0) goto err;
3498 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3499 zfree(c);
3500 return createObject(REDIS_STRING,val);
3501 err:
3502 zfree(c);
3503 sdsfree(val);
3504 return NULL;
3505 }
3506
3507 static robj *rdbLoadStringObject(FILE*fp) {
3508 int isencoded;
3509 uint32_t len;
3510 sds val;
3511
3512 len = rdbLoadLen(fp,&isencoded);
3513 if (isencoded) {
3514 switch(len) {
3515 case REDIS_RDB_ENC_INT8:
3516 case REDIS_RDB_ENC_INT16:
3517 case REDIS_RDB_ENC_INT32:
3518 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3519 case REDIS_RDB_ENC_LZF:
3520 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3521 default:
3522 redisAssert(0);
3523 }
3524 }
3525
3526 if (len == REDIS_RDB_LENERR) return NULL;
3527 val = sdsnewlen(NULL,len);
3528 if (len && fread(val,len,1,fp) == 0) {
3529 sdsfree(val);
3530 return NULL;
3531 }
3532 return tryObjectSharing(createObject(REDIS_STRING,val));
3533 }
3534
3535 /* For information about double serialization check rdbSaveDoubleValue() */
3536 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3537 char buf[128];
3538 unsigned char len;
3539
3540 if (fread(&len,1,1,fp) == 0) return -1;
3541 switch(len) {
3542 case 255: *val = R_NegInf; return 0;
3543 case 254: *val = R_PosInf; return 0;
3544 case 253: *val = R_Nan; return 0;
3545 default:
3546 if (fread(buf,len,1,fp) == 0) return -1;
3547 buf[len] = '\0';
3548 sscanf(buf, "%lg", val);
3549 return 0;
3550 }
3551 }
3552
3553 /* Load a Redis object of the specified type from the specified file.
3554 * On success a newly allocated object is returned, otherwise NULL. */
3555 static robj *rdbLoadObject(int type, FILE *fp) {
3556 robj *o;
3557
3558 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3559 if (type == REDIS_STRING) {
3560 /* Read string value */
3561 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3562 tryObjectEncoding(o);
3563 } else if (type == REDIS_LIST || type == REDIS_SET) {
3564 /* Read list/set value */
3565 uint32_t listlen;
3566
3567 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3568 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3569 /* It's faster to expand the dict to the right size asap in order
3570 * to avoid rehashing */
3571 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3572 dictExpand(o->ptr,listlen);
3573 /* Load every single element of the list/set */
3574 while(listlen--) {
3575 robj *ele;
3576
3577 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3578 tryObjectEncoding(ele);
3579 if (type == REDIS_LIST) {
3580 listAddNodeTail((list*)o->ptr,ele);
3581 } else {
3582 dictAdd((dict*)o->ptr,ele,NULL);
3583 }
3584 }
3585 } else if (type == REDIS_ZSET) {
3586 /* Read list/set value */
3587 size_t zsetlen;
3588 zset *zs;
3589
3590 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3591 o = createZsetObject();
3592 zs = o->ptr;
3593 /* Load every single element of the list/set */
3594 while(zsetlen--) {
3595 robj *ele;
3596 double *score = zmalloc(sizeof(double));
3597
3598 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3599 tryObjectEncoding(ele);
3600 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3601 dictAdd(zs->dict,ele,score);
3602 zslInsert(zs->zsl,*score,ele);
3603 incrRefCount(ele); /* added to skiplist */
3604 }
3605 } else if (type == REDIS_HASH) {
3606 size_t hashlen;
3607
3608 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3609 o = createHashObject();
3610 /* Too many entries? Use an hash table. */
3611 if (hashlen > server.hash_max_zipmap_entries)
3612 convertToRealHash(o);
3613 /* Load every key/value, then set it into the zipmap or hash
3614 * table, as needed. */
3615 while(hashlen--) {
3616 robj *key, *val;
3617
3618 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3619 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3620 /* If we are using a zipmap and there are too big values
3621 * the object is converted to real hash table encoding. */
3622 if (o->encoding != REDIS_ENCODING_HT &&
3623 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3624 sdslen(val->ptr) > server.hash_max_zipmap_value))
3625 {
3626 convertToRealHash(o);
3627 }
3628
3629 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3630 unsigned char *zm = o->ptr;
3631
3632 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3633 val->ptr,sdslen(val->ptr),NULL);
3634 o->ptr = zm;
3635 decrRefCount(key);
3636 decrRefCount(val);
3637 } else {
3638 tryObjectEncoding(key);
3639 tryObjectEncoding(val);
3640 dictAdd((dict*)o->ptr,key,val);
3641 }
3642 }
3643 } else {
3644 redisAssert(0);
3645 }
3646 return o;
3647 }
3648
3649 static int rdbLoad(char *filename) {
3650 FILE *fp;
3651 robj *keyobj = NULL;
3652 uint32_t dbid;
3653 int type, retval, rdbver;
3654 dict *d = server.db[0].dict;
3655 redisDb *db = server.db+0;
3656 char buf[1024];
3657 time_t expiretime = -1, now = time(NULL);
3658 long long loadedkeys = 0;
3659
3660 fp = fopen(filename,"r");
3661 if (!fp) return REDIS_ERR;
3662 if (fread(buf,9,1,fp) == 0) goto eoferr;
3663 buf[9] = '\0';
3664 if (memcmp(buf,"REDIS",5) != 0) {
3665 fclose(fp);
3666 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3667 return REDIS_ERR;
3668 }
3669 rdbver = atoi(buf+5);
3670 if (rdbver != 1) {
3671 fclose(fp);
3672 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3673 return REDIS_ERR;
3674 }
3675 while(1) {
3676 robj *o;
3677
3678 /* Read type. */
3679 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3680 if (type == REDIS_EXPIRETIME) {
3681 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3682 /* We read the time so we need to read the object type again */
3683 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3684 }
3685 if (type == REDIS_EOF) break;
3686 /* Handle SELECT DB opcode as a special case */
3687 if (type == REDIS_SELECTDB) {
3688 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3689 goto eoferr;
3690 if (dbid >= (unsigned)server.dbnum) {
3691 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3692 exit(1);
3693 }
3694 db = server.db+dbid;
3695 d = db->dict;
3696 continue;
3697 }
3698 /* Read key */
3699 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3700 /* Read value */
3701 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3702 /* Add the new object in the hash table */
3703 retval = dictAdd(d,keyobj,o);
3704 if (retval == DICT_ERR) {
3705 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3706 exit(1);
3707 }
3708 /* Set the expire time if needed */
3709 if (expiretime != -1) {
3710 setExpire(db,keyobj,expiretime);
3711 /* Delete this key if already expired */
3712 if (expiretime < now) deleteKey(db,keyobj);
3713 expiretime = -1;
3714 }
3715 keyobj = o = NULL;
3716 /* Handle swapping while loading big datasets when VM is on */
3717 loadedkeys++;
3718 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3719 while (zmalloc_used_memory() > server.vm_max_memory) {
3720 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3721 }
3722 }
3723 }
3724 fclose(fp);
3725 return REDIS_OK;
3726
3727 eoferr: /* unexpected end of file is handled here with a fatal exit */
3728 if (keyobj) decrRefCount(keyobj);
3729 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3730 exit(1);
3731 return REDIS_ERR; /* Just to avoid warning */
3732 }
3733
3734 /*================================== Commands =============================== */
3735
3736 static void authCommand(redisClient *c) {
3737 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3738 c->authenticated = 1;
3739 addReply(c,shared.ok);
3740 } else {
3741 c->authenticated = 0;
3742 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3743 }
3744 }
3745
3746 static void pingCommand(redisClient *c) {
3747 addReply(c,shared.pong);
3748 }
3749
3750 static void echoCommand(redisClient *c) {
3751 addReplyBulk(c,c->argv[1]);
3752 }
3753
3754 /*=================================== Strings =============================== */
3755
3756 static void setGenericCommand(redisClient *c, int nx) {
3757 int retval;
3758
3759 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3760 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3761 if (retval == DICT_ERR) {
3762 if (!nx) {
3763 /* If the key is about a swapped value, we want a new key object
3764 * to overwrite the old. So we delete the old key in the database.
3765 * This will also make sure that swap pages about the old object
3766 * will be marked as free. */
3767 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3768 incrRefCount(c->argv[1]);
3769 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3770 incrRefCount(c->argv[2]);
3771 } else {
3772 addReply(c,shared.czero);
3773 return;
3774 }
3775 } else {
3776 incrRefCount(c->argv[1]);
3777 incrRefCount(c->argv[2]);
3778 }
3779 server.dirty++;
3780 removeExpire(c->db,c->argv[1]);
3781 addReply(c, nx ? shared.cone : shared.ok);
3782 }
3783
3784 static void setCommand(redisClient *c) {
3785 setGenericCommand(c,0);
3786 }
3787
3788 static void setnxCommand(redisClient *c) {
3789 setGenericCommand(c,1);
3790 }
3791
3792 static int getGenericCommand(redisClient *c) {
3793 robj *o;
3794
3795 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3796 return REDIS_OK;
3797
3798 if (o->type != REDIS_STRING) {
3799 addReply(c,shared.wrongtypeerr);
3800 return REDIS_ERR;
3801 } else {
3802 addReplyBulk(c,o);
3803 return REDIS_OK;
3804 }
3805 }
3806
3807 static void getCommand(redisClient *c) {
3808 getGenericCommand(c);
3809 }
3810
3811 static void getsetCommand(redisClient *c) {
3812 if (getGenericCommand(c) == REDIS_ERR) return;
3813 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3814 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3815 } else {
3816 incrRefCount(c->argv[1]);
3817 }
3818 incrRefCount(c->argv[2]);
3819 server.dirty++;
3820 removeExpire(c->db,c->argv[1]);
3821 }
3822
3823 static void mgetCommand(redisClient *c) {
3824 int j;
3825
3826 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3827 for (j = 1; j < c->argc; j++) {
3828 robj *o = lookupKeyRead(c->db,c->argv[j]);
3829 if (o == NULL) {
3830 addReply(c,shared.nullbulk);
3831 } else {
3832 if (o->type != REDIS_STRING) {
3833 addReply(c,shared.nullbulk);
3834 } else {
3835 addReplyBulk(c,o);
3836 }
3837 }
3838 }
3839 }
3840
3841 static void msetGenericCommand(redisClient *c, int nx) {
3842 int j, busykeys = 0;
3843
3844 if ((c->argc % 2) == 0) {
3845 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3846 return;
3847 }
3848 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3849 * set nothing at all if at least one already key exists. */
3850 if (nx) {
3851 for (j = 1; j < c->argc; j += 2) {
3852 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3853 busykeys++;
3854 }
3855 }
3856 }
3857 if (busykeys) {
3858 addReply(c, shared.czero);
3859 return;
3860 }
3861
3862 for (j = 1; j < c->argc; j += 2) {
3863 int retval;
3864
3865 tryObjectEncoding(c->argv[j+1]);
3866 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3867 if (retval == DICT_ERR) {
3868 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3869 incrRefCount(c->argv[j+1]);
3870 } else {
3871 incrRefCount(c->argv[j]);
3872 incrRefCount(c->argv[j+1]);
3873 }
3874 removeExpire(c->db,c->argv[j]);
3875 }
3876 server.dirty += (c->argc-1)/2;
3877 addReply(c, nx ? shared.cone : shared.ok);
3878 }
3879
3880 static void msetCommand(redisClient *c) {
3881 msetGenericCommand(c,0);
3882 }
3883
3884 static void msetnxCommand(redisClient *c) {
3885 msetGenericCommand(c,1);
3886 }
3887
3888 static void incrDecrCommand(redisClient *c, long long incr) {
3889 long long value;
3890 int retval;
3891 robj *o;
3892
3893 o = lookupKeyWrite(c->db,c->argv[1]);
3894 if (o == NULL) {
3895 value = 0;
3896 } else {
3897 if (o->type != REDIS_STRING) {
3898 value = 0;
3899 } else {
3900 char *eptr;
3901
3902 if (o->encoding == REDIS_ENCODING_RAW)
3903 value = strtoll(o->ptr, &eptr, 10);
3904 else if (o->encoding == REDIS_ENCODING_INT)
3905 value = (long)o->ptr;
3906 else
3907 redisAssert(1 != 1);
3908 }
3909 }
3910
3911 value += incr;
3912 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3913 tryObjectEncoding(o);
3914 retval = dictAdd(c->db->dict,c->argv[1],o);
3915 if (retval == DICT_ERR) {
3916 dictReplace(c->db->dict,c->argv[1],o);
3917 removeExpire(c->db,c->argv[1]);
3918 } else {
3919 incrRefCount(c->argv[1]);
3920 }
3921 server.dirty++;
3922 addReply(c,shared.colon);
3923 addReply(c,o);
3924 addReply(c,shared.crlf);
3925 }
3926
3927 static void incrCommand(redisClient *c) {
3928 incrDecrCommand(c,1);
3929 }
3930
3931 static void decrCommand(redisClient *c) {
3932 incrDecrCommand(c,-1);
3933 }
3934
3935 static void incrbyCommand(redisClient *c) {
3936 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3937 incrDecrCommand(c,incr);
3938 }
3939
3940 static void decrbyCommand(redisClient *c) {
3941 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3942 incrDecrCommand(c,-incr);
3943 }
3944
3945 static void appendCommand(redisClient *c) {
3946 int retval;
3947 size_t totlen;
3948 robj *o;
3949
3950 o = lookupKeyWrite(c->db,c->argv[1]);
3951 if (o == NULL) {
3952 /* Create the key */
3953 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3954 incrRefCount(c->argv[1]);
3955 incrRefCount(c->argv[2]);
3956 totlen = stringObjectLen(c->argv[2]);
3957 } else {
3958 dictEntry *de;
3959
3960 de = dictFind(c->db->dict,c->argv[1]);
3961 assert(de != NULL);
3962
3963 o = dictGetEntryVal(de);
3964 if (o->type != REDIS_STRING) {
3965 addReply(c,shared.wrongtypeerr);
3966 return;
3967 }
3968 /* If the object is specially encoded or shared we have to make
3969 * a copy */
3970 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3971 robj *decoded = getDecodedObject(o);
3972
3973 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3974 decrRefCount(decoded);
3975 dictReplace(c->db->dict,c->argv[1],o);
3976 }
3977 /* APPEND! */
3978 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3979 o->ptr = sdscatlen(o->ptr,
3980 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3981 } else {
3982 o->ptr = sdscatprintf(o->ptr, "%ld",
3983 (unsigned long) c->argv[2]->ptr);
3984 }
3985 totlen = sdslen(o->ptr);
3986 }
3987 server.dirty++;
3988 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3989 }
3990
3991 static void substrCommand(redisClient *c) {
3992 robj *o;
3993 long start = atoi(c->argv[2]->ptr);
3994 long end = atoi(c->argv[3]->ptr);
3995 size_t rangelen, strlen;
3996 sds range;
3997
3998 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
3999 checkType(c,o,REDIS_STRING)) return;
4000
4001 o = getDecodedObject(o);
4002 strlen = sdslen(o->ptr);
4003
4004 /* convert negative indexes */
4005 if (start < 0) start = strlen+start;
4006 if (end < 0) end = strlen+end;
4007 if (start < 0) start = 0;
4008 if (end < 0) end = 0;
4009
4010 /* indexes sanity checks */
4011 if (start > end || (size_t)start >= strlen) {
4012 /* Out of range start or start > end result in null reply */
4013 addReply(c,shared.nullbulk);
4014 decrRefCount(o);
4015 return;
4016 }
4017 if ((size_t)end >= strlen) end = strlen-1;
4018 rangelen = (end-start)+1;
4019
4020 /* Return the result */
4021 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4022 range = sdsnewlen((char*)o->ptr+start,rangelen);
4023 addReplySds(c,range);
4024 addReply(c,shared.crlf);
4025 decrRefCount(o);
4026 }
4027
4028 /* ========================= Type agnostic commands ========================= */
4029
4030 static void delCommand(redisClient *c) {
4031 int deleted = 0, j;
4032
4033 for (j = 1; j < c->argc; j++) {
4034 if (deleteKey(c->db,c->argv[j])) {
4035 server.dirty++;
4036 deleted++;
4037 }
4038 }
4039 addReplyLong(c,deleted);
4040 }
4041
4042 static void existsCommand(redisClient *c) {
4043 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4044 }
4045
4046 static void selectCommand(redisClient *c) {
4047 int id = atoi(c->argv[1]->ptr);
4048
4049 if (selectDb(c,id) == REDIS_ERR) {
4050 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4051 } else {
4052 addReply(c,shared.ok);
4053 }
4054 }
4055
4056 static void randomkeyCommand(redisClient *c) {
4057 dictEntry *de;
4058
4059 while(1) {
4060 de = dictGetRandomKey(c->db->dict);
4061 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4062 }
4063 if (de == NULL) {
4064 addReply(c,shared.plus);
4065 addReply(c,shared.crlf);
4066 } else {
4067 addReply(c,shared.plus);
4068 addReply(c,dictGetEntryKey(de));
4069 addReply(c,shared.crlf);
4070 }
4071 }
4072
4073 static void keysCommand(redisClient *c) {
4074 dictIterator *di;
4075 dictEntry *de;
4076 sds pattern = c->argv[1]->ptr;
4077 int plen = sdslen(pattern);
4078 unsigned long numkeys = 0;
4079 robj *lenobj = createObject(REDIS_STRING,NULL);
4080
4081 di = dictGetIterator(c->db->dict);
4082 addReply(c,lenobj);
4083 decrRefCount(lenobj);
4084 while((de = dictNext(di)) != NULL) {
4085 robj *keyobj = dictGetEntryKey(de);
4086
4087 sds key = keyobj->ptr;
4088 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4089 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4090 if (expireIfNeeded(c->db,keyobj) == 0) {
4091 addReplyBulk(c,keyobj);
4092 numkeys++;
4093 }
4094 }
4095 }
4096 dictReleaseIterator(di);
4097 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4098 }
4099
4100 static void dbsizeCommand(redisClient *c) {
4101 addReplySds(c,
4102 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4103 }
4104
4105 static void lastsaveCommand(redisClient *c) {
4106 addReplySds(c,
4107 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4108 }
4109
4110 static void typeCommand(redisClient *c) {
4111 robj *o;
4112 char *type;
4113
4114 o = lookupKeyRead(c->db,c->argv[1]);
4115 if (o == NULL) {
4116 type = "+none";
4117 } else {
4118 switch(o->type) {
4119 case REDIS_STRING: type = "+string"; break;
4120 case REDIS_LIST: type = "+list"; break;
4121 case REDIS_SET: type = "+set"; break;
4122 case REDIS_ZSET: type = "+zset"; break;
4123 case REDIS_HASH: type = "+hash"; break;
4124 default: type = "+unknown"; break;
4125 }
4126 }
4127 addReplySds(c,sdsnew(type));
4128 addReply(c,shared.crlf);
4129 }
4130
4131 static void saveCommand(redisClient *c) {
4132 if (server.bgsavechildpid != -1) {
4133 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4134 return;
4135 }
4136 if (rdbSave(server.dbfilename) == REDIS_OK) {
4137 addReply(c,shared.ok);
4138 } else {
4139 addReply(c,shared.err);
4140 }
4141 }
4142
4143 static void bgsaveCommand(redisClient *c) {
4144 if (server.bgsavechildpid != -1) {
4145 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4146 return;
4147 }
4148 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4149 char *status = "+Background saving started\r\n";
4150 addReplySds(c,sdsnew(status));
4151 } else {
4152 addReply(c,shared.err);
4153 }
4154 }
4155
4156 static void shutdownCommand(redisClient *c) {
4157 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4158 /* Kill the saving child if there is a background saving in progress.
4159 We want to avoid race conditions, for instance our saving child may
4160 overwrite the synchronous saving did by SHUTDOWN. */
4161 if (server.bgsavechildpid != -1) {
4162 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4163 kill(server.bgsavechildpid,SIGKILL);
4164 rdbRemoveTempFile(server.bgsavechildpid);
4165 }
4166 if (server.appendonly) {
4167 /* Append only file: fsync() the AOF and exit */
4168 fsync(server.appendfd);
4169 if (server.vm_enabled) unlink(server.vm_swap_file);
4170 exit(0);
4171 } else {
4172 /* Snapshotting. Perform a SYNC SAVE and exit */
4173 if (rdbSave(server.dbfilename) == REDIS_OK) {
4174 if (server.daemonize)
4175 unlink(server.pidfile);
4176 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4177 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4178 if (server.vm_enabled) unlink(server.vm_swap_file);
4179 exit(0);
4180 } else {
4181 /* Ooops.. error saving! The best we can do is to continue
4182 * operating. Note that if there was a background saving process,
4183 * in the next cron() Redis will be notified that the background
4184 * saving aborted, handling special stuff like slaves pending for
4185 * synchronization... */
4186 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4187 addReplySds(c,
4188 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4189 }
4190 }
4191 }
4192
4193 static void renameGenericCommand(redisClient *c, int nx) {
4194 robj *o;
4195
4196 /* To use the same key as src and dst is probably an error */
4197 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4198 addReply(c,shared.sameobjecterr);
4199 return;
4200 }
4201
4202 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4203 return;
4204
4205 incrRefCount(o);
4206 deleteIfVolatile(c->db,c->argv[2]);
4207 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4208 if (nx) {
4209 decrRefCount(o);
4210 addReply(c,shared.czero);
4211 return;
4212 }
4213 dictReplace(c->db->dict,c->argv[2],o);
4214 } else {
4215 incrRefCount(c->argv[2]);
4216 }
4217 deleteKey(c->db,c->argv[1]);
4218 server.dirty++;
4219 addReply(c,nx ? shared.cone : shared.ok);
4220 }
4221
4222 static void renameCommand(redisClient *c) {
4223 renameGenericCommand(c,0);
4224 }
4225
4226 static void renamenxCommand(redisClient *c) {
4227 renameGenericCommand(c,1);
4228 }
4229
4230 static void moveCommand(redisClient *c) {
4231 robj *o;
4232 redisDb *src, *dst;
4233 int srcid;
4234
4235 /* Obtain source and target DB pointers */
4236 src = c->db;
4237 srcid = c->db->id;
4238 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4239 addReply(c,shared.outofrangeerr);
4240 return;
4241 }
4242 dst = c->db;
4243 selectDb(c,srcid); /* Back to the source DB */
4244
4245 /* If the user is moving using as target the same
4246 * DB as the source DB it is probably an error. */
4247 if (src == dst) {
4248 addReply(c,shared.sameobjecterr);
4249 return;
4250 }
4251
4252 /* Check if the element exists and get a reference */
4253 o = lookupKeyWrite(c->db,c->argv[1]);
4254 if (!o) {
4255 addReply(c,shared.czero);
4256 return;
4257 }
4258
4259 /* Try to add the element to the target DB */
4260 deleteIfVolatile(dst,c->argv[1]);
4261 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4262 addReply(c,shared.czero);
4263 return;
4264 }
4265 incrRefCount(c->argv[1]);
4266 incrRefCount(o);
4267
4268 /* OK! key moved, free the entry in the source DB */
4269 deleteKey(src,c->argv[1]);
4270 server.dirty++;
4271 addReply(c,shared.cone);
4272 }
4273
4274 /* =================================== Lists ================================ */
4275 static void pushGenericCommand(redisClient *c, int where) {
4276 robj *lobj;
4277 list *list;
4278
4279 lobj = lookupKeyWrite(c->db,c->argv[1]);
4280 if (lobj == NULL) {
4281 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4282 addReply(c,shared.cone);
4283 return;
4284 }
4285 lobj = createListObject();
4286 list = lobj->ptr;
4287 if (where == REDIS_HEAD) {
4288 listAddNodeHead(list,c->argv[2]);
4289 } else {
4290 listAddNodeTail(list,c->argv[2]);
4291 }
4292 dictAdd(c->db->dict,c->argv[1],lobj);
4293 incrRefCount(c->argv[1]);
4294 incrRefCount(c->argv[2]);
4295 } else {
4296 if (lobj->type != REDIS_LIST) {
4297 addReply(c,shared.wrongtypeerr);
4298 return;
4299 }
4300 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4301 addReply(c,shared.cone);
4302 return;
4303 }
4304 list = lobj->ptr;
4305 if (where == REDIS_HEAD) {
4306 listAddNodeHead(list,c->argv[2]);
4307 } else {
4308 listAddNodeTail(list,c->argv[2]);
4309 }
4310 incrRefCount(c->argv[2]);
4311 }
4312 server.dirty++;
4313 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4314 }
4315
4316 static void lpushCommand(redisClient *c) {
4317 pushGenericCommand(c,REDIS_HEAD);
4318 }
4319
4320 static void rpushCommand(redisClient *c) {
4321 pushGenericCommand(c,REDIS_TAIL);
4322 }
4323
4324 static void llenCommand(redisClient *c) {
4325 robj *o;
4326 list *l;
4327
4328 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4329 checkType(c,o,REDIS_LIST)) return;
4330
4331 l = o->ptr;
4332 addReplyUlong(c,listLength(l));
4333 }
4334
4335 static void lindexCommand(redisClient *c) {
4336 robj *o;
4337 int index = atoi(c->argv[2]->ptr);
4338 list *list;
4339 listNode *ln;
4340
4341 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4342 checkType(c,o,REDIS_LIST)) return;
4343 list = o->ptr;
4344
4345 ln = listIndex(list, index);
4346 if (ln == NULL) {
4347 addReply(c,shared.nullbulk);
4348 } else {
4349 robj *ele = listNodeValue(ln);
4350 addReplyBulk(c,ele);
4351 }
4352 }
4353
4354 static void lsetCommand(redisClient *c) {
4355 robj *o;
4356 int index = atoi(c->argv[2]->ptr);
4357 list *list;
4358 listNode *ln;
4359
4360 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4361 checkType(c,o,REDIS_LIST)) return;
4362 list = o->ptr;
4363
4364 ln = listIndex(list, index);
4365 if (ln == NULL) {
4366 addReply(c,shared.outofrangeerr);
4367 } else {
4368 robj *ele = listNodeValue(ln);
4369
4370 decrRefCount(ele);
4371 listNodeValue(ln) = c->argv[3];
4372 incrRefCount(c->argv[3]);
4373 addReply(c,shared.ok);
4374 server.dirty++;
4375 }
4376 }
4377
4378 static void popGenericCommand(redisClient *c, int where) {
4379 robj *o;
4380 list *list;
4381 listNode *ln;
4382
4383 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4384 checkType(c,o,REDIS_LIST)) return;
4385 list = o->ptr;
4386
4387 if (where == REDIS_HEAD)
4388 ln = listFirst(list);
4389 else
4390 ln = listLast(list);
4391
4392 if (ln == NULL) {
4393 addReply(c,shared.nullbulk);
4394 } else {
4395 robj *ele = listNodeValue(ln);
4396 addReplyBulk(c,ele);
4397 listDelNode(list,ln);
4398 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4399 server.dirty++;
4400 }
4401 }
4402
4403 static void lpopCommand(redisClient *c) {
4404 popGenericCommand(c,REDIS_HEAD);
4405 }
4406
4407 static void rpopCommand(redisClient *c) {
4408 popGenericCommand(c,REDIS_TAIL);
4409 }
4410
4411 static void lrangeCommand(redisClient *c) {
4412 robj *o;
4413 int start = atoi(c->argv[2]->ptr);
4414 int end = atoi(c->argv[3]->ptr);
4415 int llen;
4416 int rangelen, j;
4417 list *list;
4418 listNode *ln;
4419 robj *ele;
4420
4421 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4422 checkType(c,o,REDIS_LIST)) return;
4423 list = o->ptr;
4424 llen = listLength(list);
4425
4426 /* convert negative indexes */
4427 if (start < 0) start = llen+start;
4428 if (end < 0) end = llen+end;
4429 if (start < 0) start = 0;
4430 if (end < 0) end = 0;
4431
4432 /* indexes sanity checks */
4433 if (start > end || start >= llen) {
4434 /* Out of range start or start > end result in empty list */
4435 addReply(c,shared.emptymultibulk);
4436 return;
4437 }
4438 if (end >= llen) end = llen-1;
4439 rangelen = (end-start)+1;
4440
4441 /* Return the result in form of a multi-bulk reply */
4442 ln = listIndex(list, start);
4443 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4444 for (j = 0; j < rangelen; j++) {
4445 ele = listNodeValue(ln);
4446 addReplyBulk(c,ele);
4447 ln = ln->next;
4448 }
4449 }
4450
4451 static void ltrimCommand(redisClient *c) {
4452 robj *o;
4453 int start = atoi(c->argv[2]->ptr);
4454 int end = atoi(c->argv[3]->ptr);
4455 int llen;
4456 int j, ltrim, rtrim;
4457 list *list;
4458 listNode *ln;
4459
4460 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4461 checkType(c,o,REDIS_LIST)) return;
4462 list = o->ptr;
4463 llen = listLength(list);
4464
4465 /* convert negative indexes */
4466 if (start < 0) start = llen+start;
4467 if (end < 0) end = llen+end;
4468 if (start < 0) start = 0;
4469 if (end < 0) end = 0;
4470
4471 /* indexes sanity checks */
4472 if (start > end || start >= llen) {
4473 /* Out of range start or start > end result in empty list */
4474 ltrim = llen;
4475 rtrim = 0;
4476 } else {
4477 if (end >= llen) end = llen-1;
4478 ltrim = start;
4479 rtrim = llen-end-1;
4480 }
4481
4482 /* Remove list elements to perform the trim */
4483 for (j = 0; j < ltrim; j++) {
4484 ln = listFirst(list);
4485 listDelNode(list,ln);
4486 }
4487 for (j = 0; j < rtrim; j++) {
4488 ln = listLast(list);
4489 listDelNode(list,ln);
4490 }
4491 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4492 server.dirty++;
4493 addReply(c,shared.ok);
4494 }
4495
4496 static void lremCommand(redisClient *c) {
4497 robj *o;
4498 list *list;
4499 listNode *ln, *next;
4500 int toremove = atoi(c->argv[2]->ptr);
4501 int removed = 0;
4502 int fromtail = 0;
4503
4504 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4505 checkType(c,o,REDIS_LIST)) return;
4506 list = o->ptr;
4507
4508 if (toremove < 0) {
4509 toremove = -toremove;
4510 fromtail = 1;
4511 }
4512 ln = fromtail ? list->tail : list->head;
4513 while (ln) {
4514 robj *ele = listNodeValue(ln);
4515
4516 next = fromtail ? ln->prev : ln->next;
4517 if (compareStringObjects(ele,c->argv[3]) == 0) {
4518 listDelNode(list,ln);
4519 server.dirty++;
4520 removed++;
4521 if (toremove && removed == toremove) break;
4522 }
4523 ln = next;
4524 }
4525 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4526 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4527 }
4528
4529 /* This is the semantic of this command:
4530 * RPOPLPUSH srclist dstlist:
4531 * IF LLEN(srclist) > 0
4532 * element = RPOP srclist
4533 * LPUSH dstlist element
4534 * RETURN element
4535 * ELSE
4536 * RETURN nil
4537 * END
4538 * END
4539 *
4540 * The idea is to be able to get an element from a list in a reliable way
4541 * since the element is not just returned but pushed against another list
4542 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4543 */
4544 static void rpoplpushcommand(redisClient *c) {
4545 robj *sobj;
4546 list *srclist;
4547 listNode *ln;
4548
4549 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4550 checkType(c,sobj,REDIS_LIST)) return;
4551 srclist = sobj->ptr;
4552 ln = listLast(srclist);
4553
4554 if (ln == NULL) {
4555 addReply(c,shared.nullbulk);
4556 } else {
4557 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4558 robj *ele = listNodeValue(ln);
4559 list *dstlist;
4560
4561 if (dobj && dobj->type != REDIS_LIST) {
4562 addReply(c,shared.wrongtypeerr);
4563 return;
4564 }
4565
4566 /* Add the element to the target list (unless it's directly
4567 * passed to some BLPOP-ing client */
4568 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4569 if (dobj == NULL) {
4570 /* Create the list if the key does not exist */
4571 dobj = createListObject();
4572 dictAdd(c->db->dict,c->argv[2],dobj);
4573 incrRefCount(c->argv[2]);
4574 }
4575 dstlist = dobj->ptr;
4576 listAddNodeHead(dstlist,ele);
4577 incrRefCount(ele);
4578 }
4579
4580 /* Send the element to the client as reply as well */
4581 addReplyBulk(c,ele);
4582
4583 /* Finally remove the element from the source list */
4584 listDelNode(srclist,ln);
4585 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4586 server.dirty++;
4587 }
4588 }
4589
4590 /* ==================================== Sets ================================ */
4591
4592 static void saddCommand(redisClient *c) {
4593 robj *set;
4594
4595 set = lookupKeyWrite(c->db,c->argv[1]);
4596 if (set == NULL) {
4597 set = createSetObject();
4598 dictAdd(c->db->dict,c->argv[1],set);
4599 incrRefCount(c->argv[1]);
4600 } else {
4601 if (set->type != REDIS_SET) {
4602 addReply(c,shared.wrongtypeerr);
4603 return;
4604 }
4605 }
4606 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4607 incrRefCount(c->argv[2]);
4608 server.dirty++;
4609 addReply(c,shared.cone);
4610 } else {
4611 addReply(c,shared.czero);
4612 }
4613 }
4614
4615 static void sremCommand(redisClient *c) {
4616 robj *set;
4617
4618 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4619 checkType(c,set,REDIS_SET)) return;
4620
4621 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4622 server.dirty++;
4623 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4624 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4625 addReply(c,shared.cone);
4626 } else {
4627 addReply(c,shared.czero);
4628 }
4629 }
4630
4631 static void smoveCommand(redisClient *c) {
4632 robj *srcset, *dstset;
4633
4634 srcset = lookupKeyWrite(c->db,c->argv[1]);
4635 dstset = lookupKeyWrite(c->db,c->argv[2]);
4636
4637 /* If the source key does not exist return 0, if it's of the wrong type
4638 * raise an error */
4639 if (srcset == NULL || srcset->type != REDIS_SET) {
4640 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4641 return;
4642 }
4643 /* Error if the destination key is not a set as well */
4644 if (dstset && dstset->type != REDIS_SET) {
4645 addReply(c,shared.wrongtypeerr);
4646 return;
4647 }
4648 /* Remove the element from the source set */
4649 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4650 /* Key not found in the src set! return zero */
4651 addReply(c,shared.czero);
4652 return;
4653 }
4654 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4655 deleteKey(c->db,c->argv[1]);
4656 server.dirty++;
4657 /* Add the element to the destination set */
4658 if (!dstset) {
4659 dstset = createSetObject();
4660 dictAdd(c->db->dict,c->argv[2],dstset);
4661 incrRefCount(c->argv[2]);
4662 }
4663 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4664 incrRefCount(c->argv[3]);
4665 addReply(c,shared.cone);
4666 }
4667
4668 static void sismemberCommand(redisClient *c) {
4669 robj *set;
4670
4671 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4672 checkType(c,set,REDIS_SET)) return;
4673
4674 if (dictFind(set->ptr,c->argv[2]))
4675 addReply(c,shared.cone);
4676 else
4677 addReply(c,shared.czero);
4678 }
4679
4680 static void scardCommand(redisClient *c) {
4681 robj *o;
4682 dict *s;
4683
4684 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4685 checkType(c,o,REDIS_SET)) return;
4686
4687 s = o->ptr;
4688 addReplyUlong(c,dictSize(s));
4689 }
4690
4691 static void spopCommand(redisClient *c) {
4692 robj *set;
4693 dictEntry *de;
4694
4695 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4696 checkType(c,set,REDIS_SET)) return;
4697
4698 de = dictGetRandomKey(set->ptr);
4699 if (de == NULL) {
4700 addReply(c,shared.nullbulk);
4701 } else {
4702 robj *ele = dictGetEntryKey(de);
4703
4704 addReplyBulk(c,ele);
4705 dictDelete(set->ptr,ele);
4706 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4707 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4708 server.dirty++;
4709 }
4710 }
4711
4712 static void srandmemberCommand(redisClient *c) {
4713 robj *set;
4714 dictEntry *de;
4715
4716 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4717 checkType(c,set,REDIS_SET)) return;
4718
4719 de = dictGetRandomKey(set->ptr);
4720 if (de == NULL) {
4721 addReply(c,shared.nullbulk);
4722 } else {
4723 robj *ele = dictGetEntryKey(de);
4724
4725 addReplyBulk(c,ele);
4726 }
4727 }
4728
4729 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4730 dict **d1 = (void*) s1, **d2 = (void*) s2;
4731
4732 return dictSize(*d1)-dictSize(*d2);
4733 }
4734
4735 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4736 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4737 dictIterator *di;
4738 dictEntry *de;
4739 robj *lenobj = NULL, *dstset = NULL;
4740 unsigned long j, cardinality = 0;
4741
4742 for (j = 0; j < setsnum; j++) {
4743 robj *setobj;
4744
4745 setobj = dstkey ?
4746 lookupKeyWrite(c->db,setskeys[j]) :
4747 lookupKeyRead(c->db,setskeys[j]);
4748 if (!setobj) {
4749 zfree(dv);
4750 if (dstkey) {
4751 if (deleteKey(c->db,dstkey))
4752 server.dirty++;
4753 addReply(c,shared.czero);
4754 } else {
4755 addReply(c,shared.nullmultibulk);
4756 }
4757 return;
4758 }
4759 if (setobj->type != REDIS_SET) {
4760 zfree(dv);
4761 addReply(c,shared.wrongtypeerr);
4762 return;
4763 }
4764 dv[j] = setobj->ptr;
4765 }
4766 /* Sort sets from the smallest to largest, this will improve our
4767 * algorithm's performace */
4768 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4769
4770 /* The first thing we should output is the total number of elements...
4771 * since this is a multi-bulk write, but at this stage we don't know
4772 * the intersection set size, so we use a trick, append an empty object
4773 * to the output list and save the pointer to later modify it with the
4774 * right length */
4775 if (!dstkey) {
4776 lenobj = createObject(REDIS_STRING,NULL);
4777 addReply(c,lenobj);
4778 decrRefCount(lenobj);
4779 } else {
4780 /* If we have a target key where to store the resulting set
4781 * create this key with an empty set inside */
4782 dstset = createSetObject();
4783 }
4784
4785 /* Iterate all the elements of the first (smallest) set, and test
4786 * the element against all the other sets, if at least one set does
4787 * not include the element it is discarded */
4788 di = dictGetIterator(dv[0]);
4789
4790 while((de = dictNext(di)) != NULL) {
4791 robj *ele;
4792
4793 for (j = 1; j < setsnum; j++)
4794 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4795 if (j != setsnum)
4796 continue; /* at least one set does not contain the member */
4797 ele = dictGetEntryKey(de);
4798 if (!dstkey) {
4799 addReplyBulk(c,ele);
4800 cardinality++;
4801 } else {
4802 dictAdd(dstset->ptr,ele,NULL);
4803 incrRefCount(ele);
4804 }
4805 }
4806 dictReleaseIterator(di);
4807
4808 if (dstkey) {
4809 /* Store the resulting set into the target, if the intersection
4810 * is not an empty set. */
4811 deleteKey(c->db,dstkey);
4812 if (dictSize((dict*)dstset->ptr) > 0) {
4813 dictAdd(c->db->dict,dstkey,dstset);
4814 incrRefCount(dstkey);
4815 addReplyLong(c,dictSize((dict*)dstset->ptr));
4816 } else {
4817 decrRefCount(dstset);
4818 addReply(c,shared.czero);
4819 }
4820 server.dirty++;
4821 } else {
4822 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4823 }
4824 zfree(dv);
4825 }
4826
4827 static void sinterCommand(redisClient *c) {
4828 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4829 }
4830
4831 static void sinterstoreCommand(redisClient *c) {
4832 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4833 }
4834
4835 #define REDIS_OP_UNION 0
4836 #define REDIS_OP_DIFF 1
4837 #define REDIS_OP_INTER 2
4838
4839 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4840 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4841 dictIterator *di;
4842 dictEntry *de;
4843 robj *dstset = NULL;
4844 int j, cardinality = 0;
4845
4846 for (j = 0; j < setsnum; j++) {
4847 robj *setobj;
4848
4849 setobj = dstkey ?
4850 lookupKeyWrite(c->db,setskeys[j]) :
4851 lookupKeyRead(c->db,setskeys[j]);
4852 if (!setobj) {
4853 dv[j] = NULL;
4854 continue;
4855 }
4856 if (setobj->type != REDIS_SET) {
4857 zfree(dv);
4858 addReply(c,shared.wrongtypeerr);
4859 return;
4860 }
4861 dv[j] = setobj->ptr;
4862 }
4863
4864 /* We need a temp set object to store our union. If the dstkey
4865 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4866 * this set object will be the resulting object to set into the target key*/
4867 dstset = createSetObject();
4868
4869 /* Iterate all the elements of all the sets, add every element a single
4870 * time to the result set */
4871 for (j = 0; j < setsnum; j++) {
4872 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4873 if (!dv[j]) continue; /* non existing keys are like empty sets */
4874
4875 di = dictGetIterator(dv[j]);
4876
4877 while((de = dictNext(di)) != NULL) {
4878 robj *ele;
4879
4880 /* dictAdd will not add the same element multiple times */
4881 ele = dictGetEntryKey(de);
4882 if (op == REDIS_OP_UNION || j == 0) {
4883 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4884 incrRefCount(ele);
4885 cardinality++;
4886 }
4887 } else if (op == REDIS_OP_DIFF) {
4888 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4889 cardinality--;
4890 }
4891 }
4892 }
4893 dictReleaseIterator(di);
4894
4895 /* result set is empty? Exit asap. */
4896 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4897 }
4898
4899 /* Output the content of the resulting set, if not in STORE mode */
4900 if (!dstkey) {
4901 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4902 di = dictGetIterator(dstset->ptr);
4903 while((de = dictNext(di)) != NULL) {
4904 robj *ele;
4905
4906 ele = dictGetEntryKey(de);
4907 addReplyBulk(c,ele);
4908 }
4909 dictReleaseIterator(di);
4910 decrRefCount(dstset);
4911 } else {
4912 /* If we have a target key where to store the resulting set
4913 * create this key with the result set inside */
4914 deleteKey(c->db,dstkey);
4915 if (dictSize((dict*)dstset->ptr) > 0) {
4916 dictAdd(c->db->dict,dstkey,dstset);
4917 incrRefCount(dstkey);
4918 addReplyLong(c,dictSize((dict*)dstset->ptr));
4919 } else {
4920 decrRefCount(dstset);
4921 addReply(c,shared.czero);
4922 }
4923 server.dirty++;
4924 }
4925 zfree(dv);
4926 }
4927
4928 static void sunionCommand(redisClient *c) {
4929 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4930 }
4931
4932 static void sunionstoreCommand(redisClient *c) {
4933 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4934 }
4935
4936 static void sdiffCommand(redisClient *c) {
4937 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4938 }
4939
4940 static void sdiffstoreCommand(redisClient *c) {
4941 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4942 }
4943
4944 /* ==================================== ZSets =============================== */
4945
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4953
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This makes it possible to traverse the
 * list from tail to head, which is useful for ZREVRANGE. */
4962
4963 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4964 zskiplistNode *zn = zmalloc(sizeof(*zn));
4965
4966 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4967 if (level > 0)
4968 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4969 zn->score = score;
4970 zn->obj = obj;
4971 return zn;
4972 }
4973
4974 static zskiplist *zslCreate(void) {
4975 int j;
4976 zskiplist *zsl;
4977
4978 zsl = zmalloc(sizeof(*zsl));
4979 zsl->level = 1;
4980 zsl->length = 0;
4981 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4982 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4983 zsl->header->forward[j] = NULL;
4984
4985 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4986 if (j < ZSKIPLIST_MAXLEVEL-1)
4987 zsl->header->span[j] = 0;
4988 }
4989 zsl->header->backward = NULL;
4990 zsl->tail = NULL;
4991 return zsl;
4992 }
4993
4994 static void zslFreeNode(zskiplistNode *node) {
4995 decrRefCount(node->obj);
4996 zfree(node->forward);
4997 zfree(node->span);
4998 zfree(node);
4999 }
5000
5001 static void zslFree(zskiplist *zsl) {
5002 zskiplistNode *node = zsl->header->forward[0], *next;
5003
5004 zfree(zsl->header->forward);
5005 zfree(zsl->header->span);
5006 zfree(zsl->header);
5007 while(node) {
5008 next = node->forward[0];
5009 zslFreeNode(node);
5010 node = next;
5011 }
5012 zfree(zsl);
5013 }
5014
5015 static int zslRandomLevel(void) {
5016 int level = 1;
5017 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5018 level += 1;
5019 return level;
5020 }
5021
5022 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5023 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5024 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5025 int i, level;
5026
5027 x = zsl->header;
5028 for (i = zsl->level-1; i >= 0; i--) {
5029 /* store rank that is crossed to reach the insert position */
5030 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5031
5032 while (x->forward[i] &&
5033 (x->forward[i]->score < score ||
5034 (x->forward[i]->score == score &&
5035 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5036 rank[i] += i > 0 ? x->span[i-1] : 1;
5037 x = x->forward[i];
5038 }
5039 update[i] = x;
5040 }
5041 /* we assume the key is not already inside, since we allow duplicated
5042 * scores, and the re-insertion of score and redis object should never
5043 * happpen since the caller of zslInsert() should test in the hash table
5044 * if the element is already inside or not. */
5045 level = zslRandomLevel();
5046 if (level > zsl->level) {
5047 for (i = zsl->level; i < level; i++) {
5048 rank[i] = 0;
5049 update[i] = zsl->header;
5050 update[i]->span[i-1] = zsl->length;
5051 }
5052 zsl->level = level;
5053 }
5054 x = zslCreateNode(level,score,obj);
5055 for (i = 0; i < level; i++) {
5056 x->forward[i] = update[i]->forward[i];
5057 update[i]->forward[i] = x;
5058
5059 /* update span covered by update[i] as x is inserted here */
5060 if (i > 0) {
5061 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5062 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5063 }
5064 }
5065
5066 /* increment span for untouched levels */
5067 for (i = level; i < zsl->level; i++) {
5068 update[i]->span[i-1]++;
5069 }
5070
5071 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5072 if (x->forward[0])
5073 x->forward[0]->backward = x;
5074 else
5075 zsl->tail = x;
5076 zsl->length++;
5077 }
5078
5079 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5080 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5081 int i;
5082 for (i = 0; i < zsl->level; i++) {
5083 if (update[i]->forward[i] == x) {
5084 if (i > 0) {
5085 update[i]->span[i-1] += x->span[i-1] - 1;
5086 }
5087 update[i]->forward[i] = x->forward[i];
5088 } else {
5089 /* invariant: i > 0, because update[0]->forward[0]
5090 * is always equal to x */
5091 update[i]->span[i-1] -= 1;
5092 }
5093 }
5094 if (x->forward[0]) {
5095 x->forward[0]->backward = x->backward;
5096 } else {
5097 zsl->tail = x->backward;
5098 }
5099 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5100 zsl->level--;
5101 zsl->length--;
5102 }
5103
5104 /* Delete an element with matching score/object from the skiplist. */
5105 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5106 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5107 int i;
5108
5109 x = zsl->header;
5110 for (i = zsl->level-1; i >= 0; i--) {
5111 while (x->forward[i] &&
5112 (x->forward[i]->score < score ||
5113 (x->forward[i]->score == score &&
5114 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5115 x = x->forward[i];
5116 update[i] = x;
5117 }
5118 /* We may have multiple elements with the same score, what we need
5119 * is to find the element with both the right score and object. */
5120 x = x->forward[0];
5121 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5122 zslDeleteNode(zsl, x, update);
5123 zslFreeNode(x);
5124 return 1;
5125 } else {
5126 return 0; /* not found */
5127 }
5128 return 0; /* not found */
5129 }
5130
5131 /* Delete all the elements with score between min and max from the skiplist.
5132 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5133 * Note that this function takes the reference to the hash table view of the
5134 * sorted set, in order to remove the elements from the hash table too. */
5135 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5136 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5137 unsigned long removed = 0;
5138 int i;
5139
5140 x = zsl->header;
5141 for (i = zsl->level-1; i >= 0; i--) {
5142 while (x->forward[i] && x->forward[i]->score < min)
5143 x = x->forward[i];
5144 update[i] = x;
5145 }
5146 /* We may have multiple elements with the same score, what we need
5147 * is to find the element with both the right score and object. */
5148 x = x->forward[0];
5149 while (x && x->score <= max) {
5150 zskiplistNode *next = x->forward[0];
5151 zslDeleteNode(zsl, x, update);
5152 dictDelete(dict,x->obj);
5153 zslFreeNode(x);
5154 removed++;
5155 x = next;
5156 }
5157 return removed; /* not found */
5158 }
5159
5160 /* Delete all the elements with rank between start and end from the skiplist.
5161 * Start and end are inclusive. Note that start and end need to be 1-based */
5162 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5163 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5164 unsigned long traversed = 0, removed = 0;
5165 int i;
5166
5167 x = zsl->header;
5168 for (i = zsl->level-1; i >= 0; i--) {
5169 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5170 traversed += i > 0 ? x->span[i-1] : 1;
5171 x = x->forward[i];
5172 }
5173 update[i] = x;
5174 }
5175
5176 traversed++;
5177 x = x->forward[0];
5178 while (x && traversed <= end) {
5179 zskiplistNode *next = x->forward[0];
5180 zslDeleteNode(zsl, x, update);
5181 dictDelete(dict,x->obj);
5182 zslFreeNode(x);
5183 removed++;
5184 traversed++;
5185 x = next;
5186 }
5187 return removed;
5188 }
5189
5190 /* Find the first node having a score equal or greater than the specified one.
5191 * Returns NULL if there is no match. */
5192 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5193 zskiplistNode *x;
5194 int i;
5195
5196 x = zsl->header;
5197 for (i = zsl->level-1; i >= 0; i--) {
5198 while (x->forward[i] && x->forward[i]->score < score)
5199 x = x->forward[i];
5200 }
5201 /* We may have multiple elements with the same score, what we need
5202 * is to find the element with both the right score and object. */
5203 return x->forward[0];
5204 }
5205
5206 /* Find the rank for an element by both score and key.
5207 * Returns 0 when the element cannot be found, rank otherwise.
5208 * Note that the rank is 1-based due to the span of zsl->header to the
5209 * first element. */
5210 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5211 zskiplistNode *x;
5212 unsigned long rank = 0;
5213 int i;
5214
5215 x = zsl->header;
5216 for (i = zsl->level-1; i >= 0; i--) {
5217 while (x->forward[i] &&
5218 (x->forward[i]->score < score ||
5219 (x->forward[i]->score == score &&
5220 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5221 rank += i > 0 ? x->span[i-1] : 1;
5222 x = x->forward[i];
5223 }
5224
5225 /* x might be equal to zsl->header, so test if obj is non-NULL */
5226 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5227 return rank;
5228 }
5229 }
5230 return 0;
5231 }
5232
5233 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5234 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5235 zskiplistNode *x;
5236 unsigned long traversed = 0;
5237 int i;
5238
5239 x = zsl->header;
5240 for (i = zsl->level-1; i >= 0; i--) {
5241 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5242 {
5243 traversed += i > 0 ? x->span[i-1] : 1;
5244 x = x->forward[i];
5245 }
5246 if (traversed == rank) {
5247 return x;
5248 }
5249 }
5250 return NULL;
5251 }
5252
5253 /* The actual Z-commands implementations */
5254
5255 /* This generic command implements both ZADD and ZINCRBY.
5256 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5257 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5258 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5259 robj *zsetobj;
5260 zset *zs;
5261 double *score;
5262
5263 zsetobj = lookupKeyWrite(c->db,key);
5264 if (zsetobj == NULL) {
5265 zsetobj = createZsetObject();
5266 dictAdd(c->db->dict,key,zsetobj);
5267 incrRefCount(key);
5268 } else {
5269 if (zsetobj->type != REDIS_ZSET) {
5270 addReply(c,shared.wrongtypeerr);
5271 return;
5272 }
5273 }
5274 zs = zsetobj->ptr;
5275
5276 /* Ok now since we implement both ZADD and ZINCRBY here the code
5277 * needs to handle the two different conditions. It's all about setting
5278 * '*score', that is, the new score to set, to the right value. */
5279 score = zmalloc(sizeof(double));
5280 if (doincrement) {
5281 dictEntry *de;
5282
5283 /* Read the old score. If the element was not present starts from 0 */
5284 de = dictFind(zs->dict,ele);
5285 if (de) {
5286 double *oldscore = dictGetEntryVal(de);
5287 *score = *oldscore + scoreval;
5288 } else {
5289 *score = scoreval;
5290 }
5291 } else {
5292 *score = scoreval;
5293 }
5294
5295 /* What follows is a simple remove and re-insert operation that is common
5296 * to both ZADD and ZINCRBY... */
5297 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5298 /* case 1: New element */
5299 incrRefCount(ele); /* added to hash */
5300 zslInsert(zs->zsl,*score,ele);
5301 incrRefCount(ele); /* added to skiplist */
5302 server.dirty++;
5303 if (doincrement)
5304 addReplyDouble(c,*score);
5305 else
5306 addReply(c,shared.cone);
5307 } else {
5308 dictEntry *de;
5309 double *oldscore;
5310
5311 /* case 2: Score update operation */
5312 de = dictFind(zs->dict,ele);
5313 redisAssert(de != NULL);
5314 oldscore = dictGetEntryVal(de);
5315 if (*score != *oldscore) {
5316 int deleted;
5317
5318 /* Remove and insert the element in the skip list with new score */
5319 deleted = zslDelete(zs->zsl,*oldscore,ele);
5320 redisAssert(deleted != 0);
5321 zslInsert(zs->zsl,*score,ele);
5322 incrRefCount(ele);
5323 /* Update the score in the hash table */
5324 dictReplace(zs->dict,ele,score);
5325 server.dirty++;
5326 } else {
5327 zfree(score);
5328 }
5329 if (doincrement)
5330 addReplyDouble(c,*score);
5331 else
5332 addReply(c,shared.czero);
5333 }
5334 }
5335
5336 static void zaddCommand(redisClient *c) {
5337 double scoreval;
5338
5339 scoreval = strtod(c->argv[2]->ptr,NULL);
5340 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5341 }
5342
5343 static void zincrbyCommand(redisClient *c) {
5344 double scoreval;
5345
5346 scoreval = strtod(c->argv[2]->ptr,NULL);
5347 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5348 }
5349
5350 static void zremCommand(redisClient *c) {
5351 robj *zsetobj;
5352 zset *zs;
5353 dictEntry *de;
5354 double *oldscore;
5355 int deleted;
5356
5357 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5358 checkType(c,zsetobj,REDIS_ZSET)) return;
5359
5360 zs = zsetobj->ptr;
5361 de = dictFind(zs->dict,c->argv[2]);
5362 if (de == NULL) {
5363 addReply(c,shared.czero);
5364 return;
5365 }
5366 /* Delete from the skiplist */
5367 oldscore = dictGetEntryVal(de);
5368 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5369 redisAssert(deleted != 0);
5370
5371 /* Delete from the hash table */
5372 dictDelete(zs->dict,c->argv[2]);
5373 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5374 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5375 server.dirty++;
5376 addReply(c,shared.cone);
5377 }
5378
5379 static void zremrangebyscoreCommand(redisClient *c) {
5380 double min = strtod(c->argv[2]->ptr,NULL);
5381 double max = strtod(c->argv[3]->ptr,NULL);
5382 long deleted;
5383 robj *zsetobj;
5384 zset *zs;
5385
5386 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5387 checkType(c,zsetobj,REDIS_ZSET)) return;
5388
5389 zs = zsetobj->ptr;
5390 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5391 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5392 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5393 server.dirty += deleted;
5394 addReplyLong(c,deleted);
5395 }
5396
5397 static void zremrangebyrankCommand(redisClient *c) {
5398 int start = atoi(c->argv[2]->ptr);
5399 int end = atoi(c->argv[3]->ptr);
5400 int llen;
5401 long deleted;
5402 robj *zsetobj;
5403 zset *zs;
5404
5405 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5406 checkType(c,zsetobj,REDIS_ZSET)) return;
5407 zs = zsetobj->ptr;
5408 llen = zs->zsl->length;
5409
5410 /* convert negative indexes */
5411 if (start < 0) start = llen+start;
5412 if (end < 0) end = llen+end;
5413 if (start < 0) start = 0;
5414 if (end < 0) end = 0;
5415
5416 /* indexes sanity checks */
5417 if (start > end || start >= llen) {
5418 addReply(c,shared.czero);
5419 return;
5420 }
5421 if (end >= llen) end = llen-1;
5422
5423 /* increment start and end because zsl*Rank functions
5424 * use 1-based rank */
5425 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5426 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5427 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5428 server.dirty += deleted;
5429 addReplyLong(c, deleted);
5430 }
5431
5432 typedef struct {
5433 dict *dict;
5434 double weight;
5435 } zsetopsrc;
5436
5437 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5438 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5439 unsigned long size1, size2;
5440 size1 = d1->dict ? dictSize(d1->dict) : 0;
5441 size2 = d2->dict ? dictSize(d2->dict) : 0;
5442 return size1 - size2;
5443 }
5444
/* How scores of the same element coming from different inputs are merged */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Folds 'val' into '*target' according to 'aggregate': sum, minimum or
 * maximum of the two values. Any other 'aggregate' value trips an
 * assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        *target = val < *target ? val : *target;
        break;
    case REDIS_AGGR_MAX:
        *target = val > *target ? val : *target;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5461
5462 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5463 int i, j, zsetnum;
5464 int aggregate = REDIS_AGGR_SUM;
5465 zsetopsrc *src;
5466 robj *dstobj;
5467 zset *dstzset;
5468 dictIterator *di;
5469 dictEntry *de;
5470
5471 /* expect zsetnum input keys to be given */
5472 zsetnum = atoi(c->argv[2]->ptr);
5473 if (zsetnum < 1) {
5474 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5475 return;
5476 }
5477
5478 /* test if the expected number of keys would overflow */
5479 if (3+zsetnum > c->argc) {
5480 addReply(c,shared.syntaxerr);
5481 return;
5482 }
5483
5484 /* read keys to be used for input */
5485 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5486 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5487 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5488 if (!zsetobj) {
5489 src[i].dict = NULL;
5490 } else {
5491 if (zsetobj->type != REDIS_ZSET) {
5492 zfree(src);
5493 addReply(c,shared.wrongtypeerr);
5494 return;
5495 }
5496 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5497 }
5498
5499 /* default all weights to 1 */
5500 src[i].weight = 1.0;
5501 }
5502
5503 /* parse optional extra arguments */
5504 if (j < c->argc) {
5505 int remaining = c->argc - j;
5506
5507 while (remaining) {
5508 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5509 j++; remaining--;
5510 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5511 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5512 }
5513 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5514 j++; remaining--;
5515 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5516 aggregate = REDIS_AGGR_SUM;
5517 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5518 aggregate = REDIS_AGGR_MIN;
5519 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5520 aggregate = REDIS_AGGR_MAX;
5521 } else {
5522 zfree(src);
5523 addReply(c,shared.syntaxerr);
5524 return;
5525 }
5526 j++; remaining--;
5527 } else {
5528 zfree(src);
5529 addReply(c,shared.syntaxerr);
5530 return;
5531 }
5532 }
5533 }
5534
5535 /* sort sets from the smallest to largest, this will improve our
5536 * algorithm's performance */
5537 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5538
5539 dstobj = createZsetObject();
5540 dstzset = dstobj->ptr;
5541
5542 if (op == REDIS_OP_INTER) {
5543 /* skip going over all entries if the smallest zset is NULL or empty */
5544 if (src[0].dict && dictSize(src[0].dict) > 0) {
5545 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5546 * from small to large, all src[i > 0].dict are non-empty too */
5547 di = dictGetIterator(src[0].dict);
5548 while((de = dictNext(di)) != NULL) {
5549 double *score = zmalloc(sizeof(double)), value;
5550 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5551
5552 for (j = 1; j < zsetnum; j++) {
5553 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5554 if (other) {
5555 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5556 zunionInterAggregate(score, value, aggregate);
5557 } else {
5558 break;
5559 }
5560 }
5561
5562 /* skip entry when not present in every source dict */
5563 if (j != zsetnum) {
5564 zfree(score);
5565 } else {
5566 robj *o = dictGetEntryKey(de);
5567 dictAdd(dstzset->dict,o,score);
5568 incrRefCount(o); /* added to dictionary */
5569 zslInsert(dstzset->zsl,*score,o);
5570 incrRefCount(o); /* added to skiplist */
5571 }
5572 }
5573 dictReleaseIterator(di);
5574 }
5575 } else if (op == REDIS_OP_UNION) {
5576 for (i = 0; i < zsetnum; i++) {
5577 if (!src[i].dict) continue;
5578
5579 di = dictGetIterator(src[i].dict);
5580 while((de = dictNext(di)) != NULL) {
5581 /* skip key when already processed */
5582 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5583
5584 double *score = zmalloc(sizeof(double)), value;
5585 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5586
5587 /* because the zsets are sorted by size, its only possible
5588 * for sets at larger indices to hold this entry */
5589 for (j = (i+1); j < zsetnum; j++) {
5590 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5591 if (other) {
5592 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5593 zunionInterAggregate(score, value, aggregate);
5594 }
5595 }
5596
5597 robj *o = dictGetEntryKey(de);
5598 dictAdd(dstzset->dict,o,score);
5599 incrRefCount(o); /* added to dictionary */
5600 zslInsert(dstzset->zsl,*score,o);
5601 incrRefCount(o); /* added to skiplist */
5602 }
5603 dictReleaseIterator(di);
5604 }
5605 } else {
5606 /* unknown operator */
5607 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5608 }
5609
5610 deleteKey(c->db,dstkey);
5611 if (dstzset->zsl->length) {
5612 dictAdd(c->db->dict,dstkey,dstobj);
5613 incrRefCount(dstkey);
5614 addReplyLong(c, dstzset->zsl->length);
5615 server.dirty++;
5616 } else {
5617 decrRefCount(dstzset);
5618 addReply(c, shared.czero);
5619 }
5620 zfree(src);
5621 }
5622
5623 static void zunionCommand(redisClient *c) {
5624 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5625 }
5626
5627 static void zinterCommand(redisClient *c) {
5628 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5629 }
5630
5631 static void zrangeGenericCommand(redisClient *c, int reverse) {
5632 robj *o;
5633 int start = atoi(c->argv[2]->ptr);
5634 int end = atoi(c->argv[3]->ptr);
5635 int withscores = 0;
5636 int llen;
5637 int rangelen, j;
5638 zset *zsetobj;
5639 zskiplist *zsl;
5640 zskiplistNode *ln;
5641 robj *ele;
5642
5643 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5644 withscores = 1;
5645 } else if (c->argc >= 5) {
5646 addReply(c,shared.syntaxerr);
5647 return;
5648 }
5649
5650 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5651 checkType(c,o,REDIS_ZSET)) return;
5652 zsetobj = o->ptr;
5653 zsl = zsetobj->zsl;
5654 llen = zsl->length;
5655
5656 /* convert negative indexes */
5657 if (start < 0) start = llen+start;
5658 if (end < 0) end = llen+end;
5659 if (start < 0) start = 0;
5660 if (end < 0) end = 0;
5661
5662 /* indexes sanity checks */
5663 if (start > end || start >= llen) {
5664 /* Out of range start or start > end result in empty list */
5665 addReply(c,shared.emptymultibulk);
5666 return;
5667 }
5668 if (end >= llen) end = llen-1;
5669 rangelen = (end-start)+1;
5670
5671 /* check if starting point is trivial, before searching
5672 * the element in log(N) time */
5673 if (reverse) {
5674 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5675 } else {
5676 ln = start == 0 ?
5677 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5678 }
5679
5680 /* Return the result in form of a multi-bulk reply */
5681 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5682 withscores ? (rangelen*2) : rangelen));
5683 for (j = 0; j < rangelen; j++) {
5684 ele = ln->obj;
5685 addReplyBulk(c,ele);
5686 if (withscores)
5687 addReplyDouble(c,ln->score);
5688 ln = reverse ? ln->backward : ln->forward[0];
5689 }
5690 }
5691
5692 static void zrangeCommand(redisClient *c) {
5693 zrangeGenericCommand(c,0);
5694 }
5695
5696 static void zrevrangeCommand(redisClient *c) {
5697 zrangeGenericCommand(c,1);
5698 }
5699
5700 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5701 * If justcount is non-zero, just the count is returned. */
5702 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5703 robj *o;
5704 double min, max;
5705 int minex = 0, maxex = 0; /* are min or max exclusive? */
5706 int offset = 0, limit = -1;
5707 int withscores = 0;
5708 int badsyntax = 0;
5709
5710 /* Parse the min-max interval. If one of the values is prefixed
5711 * by the "(" character, it's considered "open". For instance
5712 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5713 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5714 if (((char*)c->argv[2]->ptr)[0] == '(') {
5715 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5716 minex = 1;
5717 } else {
5718 min = strtod(c->argv[2]->ptr,NULL);
5719 }
5720 if (((char*)c->argv[3]->ptr)[0] == '(') {
5721 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5722 maxex = 1;
5723 } else {
5724 max = strtod(c->argv[3]->ptr,NULL);
5725 }
5726
5727 /* Parse "WITHSCORES": note that if the command was called with
5728 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5729 * enter the following paths to parse WITHSCORES and LIMIT. */
5730 if (c->argc == 5 || c->argc == 8) {
5731 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5732 withscores = 1;
5733 else
5734 badsyntax = 1;
5735 }
5736 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5737 badsyntax = 1;
5738 if (badsyntax) {
5739 addReplySds(c,
5740 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5741 return;
5742 }
5743
5744 /* Parse "LIMIT" */
5745 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5746 addReply(c,shared.syntaxerr);
5747 return;
5748 } else if (c->argc == (7 + withscores)) {
5749 offset = atoi(c->argv[5]->ptr);
5750 limit = atoi(c->argv[6]->ptr);
5751 if (offset < 0) offset = 0;
5752 }
5753
5754 /* Ok, lookup the key and get the range */
5755 o = lookupKeyRead(c->db,c->argv[1]);
5756 if (o == NULL) {
5757 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5758 } else {
5759 if (o->type != REDIS_ZSET) {
5760 addReply(c,shared.wrongtypeerr);
5761 } else {
5762 zset *zsetobj = o->ptr;
5763 zskiplist *zsl = zsetobj->zsl;
5764 zskiplistNode *ln;
5765 robj *ele, *lenobj = NULL;
5766 unsigned long rangelen = 0;
5767
5768 /* Get the first node with the score >= min, or with
5769 * score > min if 'minex' is true. */
5770 ln = zslFirstWithScore(zsl,min);
5771 while (minex && ln && ln->score == min) ln = ln->forward[0];
5772
5773 if (ln == NULL) {
5774 /* No element matching the speciifed interval */
5775 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5776 return;
5777 }
5778
5779 /* We don't know in advance how many matching elements there
5780 * are in the list, so we push this object that will represent
5781 * the multi-bulk length in the output buffer, and will "fix"
5782 * it later */
5783 if (!justcount) {
5784 lenobj = createObject(REDIS_STRING,NULL);
5785 addReply(c,lenobj);
5786 decrRefCount(lenobj);
5787 }
5788
5789 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5790 if (offset) {
5791 offset--;
5792 ln = ln->forward[0];
5793 continue;
5794 }
5795 if (limit == 0) break;
5796 if (!justcount) {
5797 ele = ln->obj;
5798 addReplyBulk(c,ele);
5799 if (withscores)
5800 addReplyDouble(c,ln->score);
5801 }
5802 ln = ln->forward[0];
5803 rangelen++;
5804 if (limit > 0) limit--;
5805 }
5806 if (justcount) {
5807 addReplyLong(c,(long)rangelen);
5808 } else {
5809 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5810 withscores ? (rangelen*2) : rangelen);
5811 }
5812 }
5813 }
5814 }
5815
5816 static void zrangebyscoreCommand(redisClient *c) {
5817 genericZrangebyscoreCommand(c,0);
5818 }
5819
5820 static void zcountCommand(redisClient *c) {
5821 genericZrangebyscoreCommand(c,1);
5822 }
5823
5824 static void zcardCommand(redisClient *c) {
5825 robj *o;
5826 zset *zs;
5827
5828 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5829 checkType(c,o,REDIS_ZSET)) return;
5830
5831 zs = o->ptr;
5832 addReplyUlong(c,zs->zsl->length);
5833 }
5834
5835 static void zscoreCommand(redisClient *c) {
5836 robj *o;
5837 zset *zs;
5838 dictEntry *de;
5839
5840 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5841 checkType(c,o,REDIS_ZSET)) return;
5842
5843 zs = o->ptr;
5844 de = dictFind(zs->dict,c->argv[2]);
5845 if (!de) {
5846 addReply(c,shared.nullbulk);
5847 } else {
5848 double *score = dictGetEntryVal(de);
5849
5850 addReplyDouble(c,*score);
5851 }
5852 }
5853
5854 static void zrankGenericCommand(redisClient *c, int reverse) {
5855 robj *o;
5856 zset *zs;
5857 zskiplist *zsl;
5858 dictEntry *de;
5859 unsigned long rank;
5860 double *score;
5861
5862 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5863 checkType(c,o,REDIS_ZSET)) return;
5864
5865 zs = o->ptr;
5866 zsl = zs->zsl;
5867 de = dictFind(zs->dict,c->argv[2]);
5868 if (!de) {
5869 addReply(c,shared.nullbulk);
5870 return;
5871 }
5872
5873 score = dictGetEntryVal(de);
5874 rank = zslGetRank(zsl, *score, c->argv[2]);
5875 if (rank) {
5876 if (reverse) {
5877 addReplyLong(c, zsl->length - rank);
5878 } else {
5879 addReplyLong(c, rank-1);
5880 }
5881 } else {
5882 addReply(c,shared.nullbulk);
5883 }
5884 }
5885
5886 static void zrankCommand(redisClient *c) {
5887 zrankGenericCommand(c, 0);
5888 }
5889
5890 static void zrevrankCommand(redisClient *c) {
5891 zrankGenericCommand(c, 1);
5892 }
5893
5894 /* =================================== Hashes =============================== */
5895 static void hsetCommand(redisClient *c) {
5896 int update = 0;
5897 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5898
5899 if (o == NULL) {
5900 o = createHashObject();
5901 dictAdd(c->db->dict,c->argv[1],o);
5902 incrRefCount(c->argv[1]);
5903 } else {
5904 if (o->type != REDIS_HASH) {
5905 addReply(c,shared.wrongtypeerr);
5906 return;
5907 }
5908 }
5909 /* We want to convert the zipmap into an hash table right now if the
5910 * entry to be added is too big. Note that we check if the object
5911 * is integer encoded before to try fetching the length in the test below.
5912 * This is because integers are small, but currently stringObjectLen()
5913 * performs a slow conversion: not worth it. */
5914 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5915 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5916 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5917 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5918 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5919 {
5920 convertToRealHash(o);
5921 }
5922
5923 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5924 unsigned char *zm = o->ptr;
5925 robj *valobj = getDecodedObject(c->argv[3]);
5926
5927 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5928 valobj->ptr,sdslen(valobj->ptr),&update);
5929 decrRefCount(valobj);
5930 o->ptr = zm;
5931
5932 /* And here there is the second check for hash conversion...
5933 * we want to do it only if the operation was not just an update as
5934 * zipmapLen() is O(N). */
5935 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5936 convertToRealHash(o);
5937 } else {
5938 tryObjectEncoding(c->argv[2]);
5939 /* note that c->argv[3] is already encoded, as the latest arg
5940 * of a bulk command is always integer encoded if possible. */
5941 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5942 incrRefCount(c->argv[2]);
5943 } else {
5944 update = 1;
5945 }
5946 incrRefCount(c->argv[3]);
5947 }
5948 server.dirty++;
5949 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5950 }
5951
5952 static void hgetCommand(redisClient *c) {
5953 robj *o;
5954
5955 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5956 checkType(c,o,REDIS_HASH)) return;
5957
5958 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5959 unsigned char *zm = o->ptr;
5960 unsigned char *val;
5961 unsigned int vlen;
5962 robj *field;
5963
5964 field = getDecodedObject(c->argv[2]);
5965 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
5966 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5967 addReplySds(c,sdsnewlen(val,vlen));
5968 addReply(c,shared.crlf);
5969 decrRefCount(field);
5970 return;
5971 } else {
5972 addReply(c,shared.nullbulk);
5973 decrRefCount(field);
5974 return;
5975 }
5976 } else {
5977 struct dictEntry *de;
5978
5979 de = dictFind(o->ptr,c->argv[2]);
5980 if (de == NULL) {
5981 addReply(c,shared.nullbulk);
5982 } else {
5983 robj *e = dictGetEntryVal(de);
5984
5985 addReplyBulk(c,e);
5986 }
5987 }
5988 }
5989
5990 static void hdelCommand(redisClient *c) {
5991 robj *o;
5992 int deleted = 0;
5993
5994 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5995 checkType(c,o,REDIS_HASH)) return;
5996
5997 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5998 robj *field = getDecodedObject(c->argv[2]);
5999
6000 o->ptr = zipmapDel((unsigned char*) o->ptr,
6001 (unsigned char*) field->ptr,
6002 sdslen(field->ptr), &deleted);
6003 decrRefCount(field);
6004 if (zipmapLen((unsigned char*) o->ptr) == 0)
6005 deleteKey(c->db,c->argv[1]);
6006 } else {
6007 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6008 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6009 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6010 }
6011 if (deleted) server.dirty++;
6012 addReply(c,deleted ? shared.cone : shared.czero);
6013 }
6014
6015 static void hlenCommand(redisClient *c) {
6016 robj *o;
6017 unsigned long len;
6018
6019 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6020 checkType(c,o,REDIS_HASH)) return;
6021
6022 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6023 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6024 addReplyUlong(c,len);
6025 }
6026
6027 #define REDIS_GETALL_KEYS 1
6028 #define REDIS_GETALL_VALS 2
6029 static void genericHgetallCommand(redisClient *c, int flags) {
6030 robj *o, *lenobj;
6031 unsigned long count = 0;
6032
6033 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6034 || checkType(c,o,REDIS_HASH)) return;
6035
6036 lenobj = createObject(REDIS_STRING,NULL);
6037 addReply(c,lenobj);
6038 decrRefCount(lenobj);
6039
6040 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6041 unsigned char *p = zipmapRewind(o->ptr);
6042 unsigned char *field, *val;
6043 unsigned int flen, vlen;
6044
6045 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6046 robj *aux;
6047
6048 if (flags & REDIS_GETALL_KEYS) {
6049 aux = createStringObject((char*)field,flen);
6050 addReplyBulk(c,aux);
6051 decrRefCount(aux);
6052 count++;
6053 }
6054 if (flags & REDIS_GETALL_VALS) {
6055 aux = createStringObject((char*)val,vlen);
6056 addReplyBulk(c,aux);
6057 decrRefCount(aux);
6058 count++;
6059 }
6060 }
6061 } else {
6062 dictIterator *di = dictGetIterator(o->ptr);
6063 dictEntry *de;
6064
6065 while((de = dictNext(di)) != NULL) {
6066 robj *fieldobj = dictGetEntryKey(de);
6067 robj *valobj = dictGetEntryVal(de);
6068
6069 if (flags & REDIS_GETALL_KEYS) {
6070 addReplyBulk(c,fieldobj);
6071 count++;
6072 }
6073 if (flags & REDIS_GETALL_VALS) {
6074 addReplyBulk(c,valobj);
6075 count++;
6076 }
6077 }
6078 dictReleaseIterator(di);
6079 }
6080 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6081 }
6082
6083 static void hkeysCommand(redisClient *c) {
6084 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6085 }
6086
6087 static void hvalsCommand(redisClient *c) {
6088 genericHgetallCommand(c,REDIS_GETALL_VALS);
6089 }
6090
6091 static void hgetallCommand(redisClient *c) {
6092 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6093 }
6094
6095 static void hexistsCommand(redisClient *c) {
6096 robj *o;
6097 int exists = 0;
6098
6099 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6100 checkType(c,o,REDIS_HASH)) return;
6101
6102 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6103 robj *field;
6104 unsigned char *zm = o->ptr;
6105
6106 field = getDecodedObject(c->argv[2]);
6107 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6108 decrRefCount(field);
6109 } else {
6110 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6111 }
6112 addReply(c,exists ? shared.cone : shared.czero);
6113 }
6114
6115 static void convertToRealHash(robj *o) {
6116 unsigned char *key, *val, *p, *zm = o->ptr;
6117 unsigned int klen, vlen;
6118 dict *dict = dictCreate(&hashDictType,NULL);
6119
6120 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6121 p = zipmapRewind(zm);
6122 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6123 robj *keyobj, *valobj;
6124
6125 keyobj = createStringObject((char*)key,klen);
6126 valobj = createStringObject((char*)val,vlen);
6127 tryObjectEncoding(keyobj);
6128 tryObjectEncoding(valobj);
6129 dictAdd(dict,keyobj,valobj);
6130 }
6131 o->encoding = REDIS_ENCODING_HT;
6132 o->ptr = dict;
6133 zfree(zm);
6134 }
6135
6136 /* ========================= Non type-specific commands ==================== */
6137
6138 static void flushdbCommand(redisClient *c) {
6139 server.dirty += dictSize(c->db->dict);
6140 dictEmpty(c->db->dict);
6141 dictEmpty(c->db->expires);
6142 addReply(c,shared.ok);
6143 }
6144
6145 static void flushallCommand(redisClient *c) {
6146 server.dirty += emptyDb();
6147 addReply(c,shared.ok);
6148 if (server.bgsavechildpid != -1) {
6149 kill(server.bgsavechildpid,SIGKILL);
6150 rdbRemoveTempFile(server.bgsavechildpid);
6151 }
6152 rdbSave(server.dbfilename);
6153 server.dirty++;
6154 }
6155
6156 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6157 redisSortOperation *so = zmalloc(sizeof(*so));
6158 so->type = type;
6159 so->pattern = pattern;
6160 return so;
6161 }
6162
6163 /* Return the value associated to the key with a name obtained
6164 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6165 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6166 char *p;
6167 sds spat, ssub;
6168 robj keyobj;
6169 int prefixlen, sublen, postfixlen;
6170 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6171 struct {
6172 long len;
6173 long free;
6174 char buf[REDIS_SORTKEY_MAX+1];
6175 } keyname;
6176
6177 /* If the pattern is "#" return the substitution object itself in order
6178 * to implement the "SORT ... GET #" feature. */
6179 spat = pattern->ptr;
6180 if (spat[0] == '#' && spat[1] == '\0') {
6181 return subst;
6182 }
6183
6184 /* The substitution object may be specially encoded. If so we create
6185 * a decoded object on the fly. Otherwise getDecodedObject will just
6186 * increment the ref count, that we'll decrement later. */
6187 subst = getDecodedObject(subst);
6188
6189 ssub = subst->ptr;
6190 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6191 p = strchr(spat,'*');
6192 if (!p) {
6193 decrRefCount(subst);
6194 return NULL;
6195 }
6196
6197 prefixlen = p-spat;
6198 sublen = sdslen(ssub);
6199 postfixlen = sdslen(spat)-(prefixlen+1);
6200 memcpy(keyname.buf,spat,prefixlen);
6201 memcpy(keyname.buf+prefixlen,ssub,sublen);
6202 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6203 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6204 keyname.len = prefixlen+sublen+postfixlen;
6205
6206 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6207 decrRefCount(subst);
6208
6209 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6210 return lookupKeyRead(db,&keyobj);
6211 }
6212
6213 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6214 * the additional parameter is not standard but a BSD-specific we have to
6215 * pass sorting parameters via the global 'server' structure */
6216 static int sortCompare(const void *s1, const void *s2) {
6217 const redisSortObject *so1 = s1, *so2 = s2;
6218 int cmp;
6219
6220 if (!server.sort_alpha) {
6221 /* Numeric sorting. Here it's trivial as we precomputed scores */
6222 if (so1->u.score > so2->u.score) {
6223 cmp = 1;
6224 } else if (so1->u.score < so2->u.score) {
6225 cmp = -1;
6226 } else {
6227 cmp = 0;
6228 }
6229 } else {
6230 /* Alphanumeric sorting */
6231 if (server.sort_bypattern) {
6232 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6233 /* At least one compare object is NULL */
6234 if (so1->u.cmpobj == so2->u.cmpobj)
6235 cmp = 0;
6236 else if (so1->u.cmpobj == NULL)
6237 cmp = -1;
6238 else
6239 cmp = 1;
6240 } else {
6241 /* We have both the objects, use strcoll */
6242 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6243 }
6244 } else {
6245 /* Compare elements directly */
6246 robj *dec1, *dec2;
6247
6248 dec1 = getDecodedObject(so1->obj);
6249 dec2 = getDecodedObject(so2->obj);
6250 cmp = strcoll(dec1->ptr,dec2->ptr);
6251 decrRefCount(dec1);
6252 decrRefCount(dec2);
6253 }
6254 }
6255 return server.sort_desc ? -cmp : cmp;
6256 }
6257
6258 /* The SORT command is the most complex command in Redis. Warning: this code
6259 * is optimized for speed and a bit less for readability */
6260 static void sortCommand(redisClient *c) {
6261 list *operations;
6262 int outputlen = 0;
6263 int desc = 0, alpha = 0;
6264 int limit_start = 0, limit_count = -1, start, end;
6265 int j, dontsort = 0, vectorlen;
6266 int getop = 0; /* GET operation counter */
6267 robj *sortval, *sortby = NULL, *storekey = NULL;
6268 redisSortObject *vector; /* Resulting vector to sort */
6269
6270 /* Lookup the key to sort. It must be of the right types */
6271 sortval = lookupKeyRead(c->db,c->argv[1]);
6272 if (sortval == NULL) {
6273 addReply(c,shared.nullmultibulk);
6274 return;
6275 }
6276 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6277 sortval->type != REDIS_ZSET)
6278 {
6279 addReply(c,shared.wrongtypeerr);
6280 return;
6281 }
6282
6283 /* Create a list of operations to perform for every sorted element.
6284 * Operations can be GET/DEL/INCR/DECR */
6285 operations = listCreate();
6286 listSetFreeMethod(operations,zfree);
6287 j = 2;
6288
6289 /* Now we need to protect sortval incrementing its count, in the future
6290 * SORT may have options able to overwrite/delete keys during the sorting
6291 * and the sorted key itself may get destroied */
6292 incrRefCount(sortval);
6293
6294 /* The SORT command has an SQL-alike syntax, parse it */
6295 while(j < c->argc) {
6296 int leftargs = c->argc-j-1;
6297 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6298 desc = 0;
6299 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6300 desc = 1;
6301 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6302 alpha = 1;
6303 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6304 limit_start = atoi(c->argv[j+1]->ptr);
6305 limit_count = atoi(c->argv[j+2]->ptr);
6306 j+=2;
6307 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6308 storekey = c->argv[j+1];
6309 j++;
6310 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6311 sortby = c->argv[j+1];
6312 /* If the BY pattern does not contain '*', i.e. it is constant,
6313 * we don't need to sort nor to lookup the weight keys. */
6314 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6315 j++;
6316 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6317 listAddNodeTail(operations,createSortOperation(
6318 REDIS_SORT_GET,c->argv[j+1]));
6319 getop++;
6320 j++;
6321 } else {
6322 decrRefCount(sortval);
6323 listRelease(operations);
6324 addReply(c,shared.syntaxerr);
6325 return;
6326 }
6327 j++;
6328 }
6329
6330 /* Load the sorting vector with all the objects to sort */
6331 switch(sortval->type) {
6332 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6333 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6334 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6335 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6336 }
6337 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6338 j = 0;
6339
6340 if (sortval->type == REDIS_LIST) {
6341 list *list = sortval->ptr;
6342 listNode *ln;
6343 listIter li;
6344
6345 listRewind(list,&li);
6346 while((ln = listNext(&li))) {
6347 robj *ele = ln->value;
6348 vector[j].obj = ele;
6349 vector[j].u.score = 0;
6350 vector[j].u.cmpobj = NULL;
6351 j++;
6352 }
6353 } else {
6354 dict *set;
6355 dictIterator *di;
6356 dictEntry *setele;
6357
6358 if (sortval->type == REDIS_SET) {
6359 set = sortval->ptr;
6360 } else {
6361 zset *zs = sortval->ptr;
6362 set = zs->dict;
6363 }
6364
6365 di = dictGetIterator(set);
6366 while((setele = dictNext(di)) != NULL) {
6367 vector[j].obj = dictGetEntryKey(setele);
6368 vector[j].u.score = 0;
6369 vector[j].u.cmpobj = NULL;
6370 j++;
6371 }
6372 dictReleaseIterator(di);
6373 }
6374 redisAssert(j == vectorlen);
6375
6376 /* Now it's time to load the right scores in the sorting vector */
6377 if (dontsort == 0) {
6378 for (j = 0; j < vectorlen; j++) {
6379 if (sortby) {
6380 robj *byval;
6381
6382 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6383 if (!byval || byval->type != REDIS_STRING) continue;
6384 if (alpha) {
6385 vector[j].u.cmpobj = getDecodedObject(byval);
6386 } else {
6387 if (byval->encoding == REDIS_ENCODING_RAW) {
6388 vector[j].u.score = strtod(byval->ptr,NULL);
6389 } else {
6390 /* Don't need to decode the object if it's
6391 * integer-encoded (the only encoding supported) so
6392 * far. We can just cast it */
6393 if (byval->encoding == REDIS_ENCODING_INT) {
6394 vector[j].u.score = (long)byval->ptr;
6395 } else
6396 redisAssert(1 != 1);
6397 }
6398 }
6399 } else {
6400 if (!alpha) {
6401 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6402 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6403 else {
6404 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6405 vector[j].u.score = (long) vector[j].obj->ptr;
6406 else
6407 redisAssert(1 != 1);
6408 }
6409 }
6410 }
6411 }
6412 }
6413
6414 /* We are ready to sort the vector... perform a bit of sanity check
6415 * on the LIMIT option too. We'll use a partial version of quicksort. */
6416 start = (limit_start < 0) ? 0 : limit_start;
6417 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6418 if (start >= vectorlen) {
6419 start = vectorlen-1;
6420 end = vectorlen-2;
6421 }
6422 if (end >= vectorlen) end = vectorlen-1;
6423
6424 if (dontsort == 0) {
6425 server.sort_desc = desc;
6426 server.sort_alpha = alpha;
6427 server.sort_bypattern = sortby ? 1 : 0;
6428 if (sortby && (start != 0 || end != vectorlen-1))
6429 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6430 else
6431 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6432 }
6433
6434 /* Send command output to the output buffer, performing the specified
6435 * GET/DEL/INCR/DECR operations if any. */
6436 outputlen = getop ? getop*(end-start+1) : end-start+1;
6437 if (storekey == NULL) {
6438 /* STORE option not specified, sent the sorting result to client */
6439 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6440 for (j = start; j <= end; j++) {
6441 listNode *ln;
6442 listIter li;
6443
6444 if (!getop) addReplyBulk(c,vector[j].obj);
6445 listRewind(operations,&li);
6446 while((ln = listNext(&li))) {
6447 redisSortOperation *sop = ln->value;
6448 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6449 vector[j].obj);
6450
6451 if (sop->type == REDIS_SORT_GET) {
6452 if (!val || val->type != REDIS_STRING) {
6453 addReply(c,shared.nullbulk);
6454 } else {
6455 addReplyBulk(c,val);
6456 }
6457 } else {
6458 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6459 }
6460 }
6461 }
6462 } else {
6463 robj *listObject = createListObject();
6464 list *listPtr = (list*) listObject->ptr;
6465
6466 /* STORE option specified, set the sorting result as a List object */
6467 for (j = start; j <= end; j++) {
6468 listNode *ln;
6469 listIter li;
6470
6471 if (!getop) {
6472 listAddNodeTail(listPtr,vector[j].obj);
6473 incrRefCount(vector[j].obj);
6474 }
6475 listRewind(operations,&li);
6476 while((ln = listNext(&li))) {
6477 redisSortOperation *sop = ln->value;
6478 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6479 vector[j].obj);
6480
6481 if (sop->type == REDIS_SORT_GET) {
6482 if (!val || val->type != REDIS_STRING) {
6483 listAddNodeTail(listPtr,createStringObject("",0));
6484 } else {
6485 listAddNodeTail(listPtr,val);
6486 incrRefCount(val);
6487 }
6488 } else {
6489 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6490 }
6491 }
6492 }
6493 if (dictReplace(c->db->dict,storekey,listObject)) {
6494 incrRefCount(storekey);
6495 }
6496 /* Note: we add 1 because the DB is dirty anyway since even if the
6497 * SORT result is empty a new key is set and maybe the old content
6498 * replaced. */
6499 server.dirty += 1+outputlen;
6500 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6501 }
6502
6503 /* Cleanup */
6504 decrRefCount(sortval);
6505 listRelease(operations);
6506 for (j = 0; j < vectorlen; j++) {
6507 if (sortby && alpha && vector[j].u.cmpobj)
6508 decrRefCount(vector[j].u.cmpobj);
6509 }
6510 zfree(vector);
6511 }
6512
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth, using powers of 1024.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * possible output plus the terminator fits in 32 bytes). The buffer is
 * always written: amounts of 1024^4 bytes or more are printed with the
 * T and P suffixes instead of leaving 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long KB = 1024ULL;
    const unsigned long long MB = KB*1024;
    const unsigned long long GB = MB*1024;
    const unsigned long long TB = GB*1024;
    const unsigned long long PB = TB*1024;
    double d;

    if (n < KB) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < MB) {
        d = (double)n/KB;
        sprintf(s,"%.2fK",d);
    } else if (n < GB) {
        d = (double)n/MB;
        sprintf(s,"%.2fM",d);
    } else if (n < TB) {
        d = (double)n/GB;
        sprintf(s,"%.2fG",d);
    } else if (n < PB) {
        d = (double)n/TB;
        sprintf(s,"%.2fT",d);
    } else {
        /* Everything else: at most 16384.00P for a 64 bit value */
        d = (double)n/PB;
        sprintf(s,"%.2fP",d);
    }
}
6533
6534 /* Create the string returned by the INFO command. This is decoupled
6535 * by the INFO command itself as we need to report the same information
6536 * on memory corruption problems. */
6537 static sds genRedisInfoString(void) {
6538 sds info;
6539 time_t uptime = time(NULL)-server.stat_starttime;
6540 int j;
6541 char hmem[64];
6542
6543 bytesToHuman(hmem,zmalloc_used_memory());
6544 info = sdscatprintf(sdsempty(),
6545 "redis_version:%s\r\n"
6546 "arch_bits:%s\r\n"
6547 "multiplexing_api:%s\r\n"
6548 "process_id:%ld\r\n"
6549 "uptime_in_seconds:%ld\r\n"
6550 "uptime_in_days:%ld\r\n"
6551 "connected_clients:%d\r\n"
6552 "connected_slaves:%d\r\n"
6553 "blocked_clients:%d\r\n"
6554 "used_memory:%zu\r\n"
6555 "used_memory_human:%s\r\n"
6556 "changes_since_last_save:%lld\r\n"
6557 "bgsave_in_progress:%d\r\n"
6558 "last_save_time:%ld\r\n"
6559 "bgrewriteaof_in_progress:%d\r\n"
6560 "total_connections_received:%lld\r\n"
6561 "total_commands_processed:%lld\r\n"
6562 "expired_keys:%lld\r\n"
6563 "hash_max_zipmap_entries:%ld\r\n"
6564 "hash_max_zipmap_value:%ld\r\n"
6565 "vm_enabled:%d\r\n"
6566 "role:%s\r\n"
6567 ,REDIS_VERSION,
6568 (sizeof(long) == 8) ? "64" : "32",
6569 aeGetApiName(),
6570 (long) getpid(),
6571 uptime,
6572 uptime/(3600*24),
6573 listLength(server.clients)-listLength(server.slaves),
6574 listLength(server.slaves),
6575 server.blpop_blocked_clients,
6576 zmalloc_used_memory(),
6577 hmem,
6578 server.dirty,
6579 server.bgsavechildpid != -1,
6580 server.lastsave,
6581 server.bgrewritechildpid != -1,
6582 server.stat_numconnections,
6583 server.stat_numcommands,
6584 server.stat_expiredkeys,
6585 server.hash_max_zipmap_entries,
6586 server.hash_max_zipmap_value,
6587 server.vm_enabled != 0,
6588 server.masterhost == NULL ? "master" : "slave"
6589 );
6590 if (server.masterhost) {
6591 info = sdscatprintf(info,
6592 "master_host:%s\r\n"
6593 "master_port:%d\r\n"
6594 "master_link_status:%s\r\n"
6595 "master_last_io_seconds_ago:%d\r\n"
6596 ,server.masterhost,
6597 server.masterport,
6598 (server.replstate == REDIS_REPL_CONNECTED) ?
6599 "up" : "down",
6600 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6601 );
6602 }
6603 if (server.vm_enabled) {
6604 lockThreadedIO();
6605 info = sdscatprintf(info,
6606 "vm_conf_max_memory:%llu\r\n"
6607 "vm_conf_page_size:%llu\r\n"
6608 "vm_conf_pages:%llu\r\n"
6609 "vm_stats_used_pages:%llu\r\n"
6610 "vm_stats_swapped_objects:%llu\r\n"
6611 "vm_stats_swappin_count:%llu\r\n"
6612 "vm_stats_swappout_count:%llu\r\n"
6613 "vm_stats_io_newjobs_len:%lu\r\n"
6614 "vm_stats_io_processing_len:%lu\r\n"
6615 "vm_stats_io_processed_len:%lu\r\n"
6616 "vm_stats_io_active_threads:%lu\r\n"
6617 "vm_stats_blocked_clients:%lu\r\n"
6618 ,(unsigned long long) server.vm_max_memory,
6619 (unsigned long long) server.vm_page_size,
6620 (unsigned long long) server.vm_pages,
6621 (unsigned long long) server.vm_stats_used_pages,
6622 (unsigned long long) server.vm_stats_swapped_objects,
6623 (unsigned long long) server.vm_stats_swapins,
6624 (unsigned long long) server.vm_stats_swapouts,
6625 (unsigned long) listLength(server.io_newjobs),
6626 (unsigned long) listLength(server.io_processing),
6627 (unsigned long) listLength(server.io_processed),
6628 (unsigned long) server.io_active_threads,
6629 (unsigned long) server.vm_blocked_clients
6630 );
6631 unlockThreadedIO();
6632 }
6633 for (j = 0; j < server.dbnum; j++) {
6634 long long keys, vkeys;
6635
6636 keys = dictSize(server.db[j].dict);
6637 vkeys = dictSize(server.db[j].expires);
6638 if (keys || vkeys) {
6639 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6640 j, keys, vkeys);
6641 }
6642 }
6643 return info;
6644 }
6645
6646 static void infoCommand(redisClient *c) {
6647 sds info = genRedisInfoString();
6648 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6649 (unsigned long)sdslen(info)));
6650 addReplySds(c,info);
6651 addReply(c,shared.crlf);
6652 }
6653
6654 static void monitorCommand(redisClient *c) {
6655 /* ignore MONITOR if aleady slave or in monitor mode */
6656 if (c->flags & REDIS_SLAVE) return;
6657
6658 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6659 c->slaveseldb = 0;
6660 listAddNodeTail(server.monitors,c);
6661 addReply(c,shared.ok);
6662 }
6663
6664 /* ================================= Expire ================================= */
6665 static int removeExpire(redisDb *db, robj *key) {
6666 if (dictDelete(db->expires,key) == DICT_OK) {
6667 return 1;
6668 } else {
6669 return 0;
6670 }
6671 }
6672
6673 static int setExpire(redisDb *db, robj *key, time_t when) {
6674 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6675 return 0;
6676 } else {
6677 incrRefCount(key);
6678 return 1;
6679 }
6680 }
6681
6682 /* Return the expire time of the specified key, or -1 if no expire
6683 * is associated with this key (i.e. the key is non volatile) */
6684 static time_t getExpire(redisDb *db, robj *key) {
6685 dictEntry *de;
6686
6687 /* No expire? return ASAP */
6688 if (dictSize(db->expires) == 0 ||
6689 (de = dictFind(db->expires,key)) == NULL) return -1;
6690
6691 return (time_t) dictGetEntryVal(de);
6692 }
6693
6694 static int expireIfNeeded(redisDb *db, robj *key) {
6695 time_t when;
6696 dictEntry *de;
6697
6698 /* No expire? return ASAP */
6699 if (dictSize(db->expires) == 0 ||
6700 (de = dictFind(db->expires,key)) == NULL) return 0;
6701
6702 /* Lookup the expire */
6703 when = (time_t) dictGetEntryVal(de);
6704 if (time(NULL) <= when) return 0;
6705
6706 /* Delete the key */
6707 dictDelete(db->expires,key);
6708 server.stat_expiredkeys++;
6709 return dictDelete(db->dict,key) == DICT_OK;
6710 }
6711
6712 static int deleteIfVolatile(redisDb *db, robj *key) {
6713 dictEntry *de;
6714
6715 /* No expire? return ASAP */
6716 if (dictSize(db->expires) == 0 ||
6717 (de = dictFind(db->expires,key)) == NULL) return 0;
6718
6719 /* Delete the key */
6720 server.dirty++;
6721 server.stat_expiredkeys++;
6722 dictDelete(db->expires,key);
6723 return dictDelete(db->dict,key) == DICT_OK;
6724 }
6725
6726 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6727 dictEntry *de;
6728
6729 de = dictFind(c->db->dict,key);
6730 if (de == NULL) {
6731 addReply(c,shared.czero);
6732 return;
6733 }
6734 if (seconds < 0) {
6735 if (deleteKey(c->db,key)) server.dirty++;
6736 addReply(c, shared.cone);
6737 return;
6738 } else {
6739 time_t when = time(NULL)+seconds;
6740 if (setExpire(c->db,key,when)) {
6741 addReply(c,shared.cone);
6742 server.dirty++;
6743 } else {
6744 addReply(c,shared.czero);
6745 }
6746 return;
6747 }
6748 }
6749
6750 static void expireCommand(redisClient *c) {
6751 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6752 }
6753
6754 static void expireatCommand(redisClient *c) {
6755 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6756 }
6757
6758 static void ttlCommand(redisClient *c) {
6759 time_t expire;
6760 int ttl = -1;
6761
6762 expire = getExpire(c->db,c->argv[1]);
6763 if (expire != -1) {
6764 ttl = (int) (expire-time(NULL));
6765 if (ttl < 0) ttl = -1;
6766 }
6767 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6768 }
6769
6770 /* ================================ MULTI/EXEC ============================== */
6771
6772 /* Client state initialization for MULTI/EXEC */
6773 static void initClientMultiState(redisClient *c) {
6774 c->mstate.commands = NULL;
6775 c->mstate.count = 0;
6776 }
6777
6778 /* Release all the resources associated with MULTI/EXEC state */
6779 static void freeClientMultiState(redisClient *c) {
6780 int j;
6781
6782 for (j = 0; j < c->mstate.count; j++) {
6783 int i;
6784 multiCmd *mc = c->mstate.commands+j;
6785
6786 for (i = 0; i < mc->argc; i++)
6787 decrRefCount(mc->argv[i]);
6788 zfree(mc->argv);
6789 }
6790 zfree(c->mstate.commands);
6791 }
6792
6793 /* Add a new command into the MULTI commands queue */
6794 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6795 multiCmd *mc;
6796 int j;
6797
6798 c->mstate.commands = zrealloc(c->mstate.commands,
6799 sizeof(multiCmd)*(c->mstate.count+1));
6800 mc = c->mstate.commands+c->mstate.count;
6801 mc->cmd = cmd;
6802 mc->argc = c->argc;
6803 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6804 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6805 for (j = 0; j < c->argc; j++)
6806 incrRefCount(mc->argv[j]);
6807 c->mstate.count++;
6808 }
6809
6810 static void multiCommand(redisClient *c) {
6811 c->flags |= REDIS_MULTI;
6812 addReply(c,shared.ok);
6813 }
6814
6815 static void discardCommand(redisClient *c) {
6816 if (!(c->flags & REDIS_MULTI)) {
6817 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6818 return;
6819 }
6820
6821 freeClientMultiState(c);
6822 initClientMultiState(c);
6823 c->flags &= (~REDIS_MULTI);
6824 addReply(c,shared.ok);
6825 }
6826
6827 static void execCommand(redisClient *c) {
6828 int j;
6829 robj **orig_argv;
6830 int orig_argc;
6831
6832 if (!(c->flags & REDIS_MULTI)) {
6833 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6834 return;
6835 }
6836
6837 orig_argv = c->argv;
6838 orig_argc = c->argc;
6839 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6840 for (j = 0; j < c->mstate.count; j++) {
6841 c->argc = c->mstate.commands[j].argc;
6842 c->argv = c->mstate.commands[j].argv;
6843 call(c,c->mstate.commands[j].cmd);
6844 }
6845 c->argv = orig_argv;
6846 c->argc = orig_argc;
6847 freeClientMultiState(c);
6848 initClientMultiState(c);
6849 c->flags &= (~REDIS_MULTI);
6850 }
6851
6852 /* =========================== Blocking Operations ========================= */
6853
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6882
6883 /* Set a client in blocking mode for the specified key, with the specified
6884 * timeout */
6885 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6886 dictEntry *de;
6887 list *l;
6888 int j;
6889
6890 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6891 c->blockingkeysnum = numkeys;
6892 c->blockingto = timeout;
6893 for (j = 0; j < numkeys; j++) {
6894 /* Add the key in the client structure, to map clients -> keys */
6895 c->blockingkeys[j] = keys[j];
6896 incrRefCount(keys[j]);
6897
6898 /* And in the other "side", to map keys -> clients */
6899 de = dictFind(c->db->blockingkeys,keys[j]);
6900 if (de == NULL) {
6901 int retval;
6902
6903 /* For every key we take a list of clients blocked for it */
6904 l = listCreate();
6905 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6906 incrRefCount(keys[j]);
6907 assert(retval == DICT_OK);
6908 } else {
6909 l = dictGetEntryVal(de);
6910 }
6911 listAddNodeTail(l,c);
6912 }
6913 /* Mark the client as a blocked client */
6914 c->flags |= REDIS_BLOCKED;
6915 server.blpop_blocked_clients++;
6916 }
6917
6918 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6919 static void unblockClientWaitingData(redisClient *c) {
6920 dictEntry *de;
6921 list *l;
6922 int j;
6923
6924 assert(c->blockingkeys != NULL);
6925 /* The client may wait for multiple keys, so unblock it for every key. */
6926 for (j = 0; j < c->blockingkeysnum; j++) {
6927 /* Remove this client from the list of clients waiting for this key. */
6928 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6929 assert(de != NULL);
6930 l = dictGetEntryVal(de);
6931 listDelNode(l,listSearchKey(l,c));
6932 /* If the list is empty we need to remove it to avoid wasting memory */
6933 if (listLength(l) == 0)
6934 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6935 decrRefCount(c->blockingkeys[j]);
6936 }
6937 /* Cleanup the client structure */
6938 zfree(c->blockingkeys);
6939 c->blockingkeys = NULL;
6940 c->flags &= (~REDIS_BLOCKED);
6941 server.blpop_blocked_clients--;
6942 /* We want to process data if there is some command waiting
6943 * in the input buffer. Note that this is safe even if
6944 * unblockClientWaitingData() gets called from freeClient() because
6945 * freeClient() will be smart enough to call this function
6946 * *after* c->querybuf was set to NULL. */
6947 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6948 }
6949
6950 /* This should be called from any function PUSHing into lists.
6951 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6952 * 'ele' is the element pushed.
6953 *
6954 * If the function returns 0 there was no client waiting for a list push
6955 * against this key.
6956 *
6957 * If the function returns 1 there was a client waiting for a list push
6958 * against this key, the element was passed to this client thus it's not
6959 * needed to actually add it to the list and the caller should return asap. */
6960 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6961 struct dictEntry *de;
6962 redisClient *receiver;
6963 list *l;
6964 listNode *ln;
6965
6966 de = dictFind(c->db->blockingkeys,key);
6967 if (de == NULL) return 0;
6968 l = dictGetEntryVal(de);
6969 ln = listFirst(l);
6970 assert(ln != NULL);
6971 receiver = ln->value;
6972
6973 addReplySds(receiver,sdsnew("*2\r\n"));
6974 addReplyBulk(receiver,key);
6975 addReplyBulk(receiver,ele);
6976 unblockClientWaitingData(receiver);
6977 return 1;
6978 }
6979
6980 /* Blocking RPOP/LPOP */
6981 static void blockingPopGenericCommand(redisClient *c, int where) {
6982 robj *o;
6983 time_t timeout;
6984 int j;
6985
6986 for (j = 1; j < c->argc-1; j++) {
6987 o = lookupKeyWrite(c->db,c->argv[j]);
6988 if (o != NULL) {
6989 if (o->type != REDIS_LIST) {
6990 addReply(c,shared.wrongtypeerr);
6991 return;
6992 } else {
6993 list *list = o->ptr;
6994 if (listLength(list) != 0) {
6995 /* If the list contains elements fall back to the usual
6996 * non-blocking POP operation */
6997 robj *argv[2], **orig_argv;
6998 int orig_argc;
6999
7000 /* We need to alter the command arguments before to call
7001 * popGenericCommand() as the command takes a single key. */
7002 orig_argv = c->argv;
7003 orig_argc = c->argc;
7004 argv[1] = c->argv[j];
7005 c->argv = argv;
7006 c->argc = 2;
7007
7008 /* Also the return value is different, we need to output
7009 * the multi bulk reply header and the key name. The
7010 * "real" command will add the last element (the value)
7011 * for us. If this souds like an hack to you it's just
7012 * because it is... */
7013 addReplySds(c,sdsnew("*2\r\n"));
7014 addReplyBulk(c,argv[1]);
7015 popGenericCommand(c,where);
7016
7017 /* Fix the client structure with the original stuff */
7018 c->argv = orig_argv;
7019 c->argc = orig_argc;
7020 return;
7021 }
7022 }
7023 }
7024 }
7025 /* If the list is empty or the key does not exists we must block */
7026 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7027 if (timeout > 0) timeout += time(NULL);
7028 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7029 }
7030
7031 static void blpopCommand(redisClient *c) {
7032 blockingPopGenericCommand(c,REDIS_HEAD);
7033 }
7034
7035 static void brpopCommand(redisClient *c) {
7036 blockingPopGenericCommand(c,REDIS_TAIL);
7037 }
7038
7039 /* =============================== Replication ============================= */
7040
7041 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7042 ssize_t nwritten, ret = size;
7043 time_t start = time(NULL);
7044
7045 timeout++;
7046 while(size) {
7047 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7048 nwritten = write(fd,ptr,size);
7049 if (nwritten == -1) return -1;
7050 ptr += nwritten;
7051 size -= nwritten;
7052 }
7053 if ((time(NULL)-start) > timeout) {
7054 errno = ETIMEDOUT;
7055 return -1;
7056 }
7057 }
7058 return ret;
7059 }
7060
7061 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7062 ssize_t nread, totread = 0;
7063 time_t start = time(NULL);
7064
7065 timeout++;
7066 while(size) {
7067 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7068 nread = read(fd,ptr,size);
7069 if (nread == -1) return -1;
7070 ptr += nread;
7071 size -= nread;
7072 totread += nread;
7073 }
7074 if ((time(NULL)-start) > timeout) {
7075 errno = ETIMEDOUT;
7076 return -1;
7077 }
7078 }
7079 return totread;
7080 }
7081
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time via syncRead() (the timeout applies to every single byte).
 *
 * The line terminator "\n" or "\r\n" is not stored, and the buffer is
 * always null terminated when 'size' is at least 1. At most size-1 bytes
 * are stored: if no newline is found within that limit the function
 * returns what was read so far.
 *
 * Returns the number of bytes read before the '\n' (a stripped '\r' is
 * included in the count), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0; /* No room, not even for the terminator. */
    *ptr = '\0';
    size--; /* Reserve one byte for the null term. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the stored byte, otherwise a line longer than the
         * buffer would be written past its end. */
        size--;
    }
    return nread;
}
7102
7103 static void syncCommand(redisClient *c) {
7104 /* ignore SYNC if aleady slave or in monitor mode */
7105 if (c->flags & REDIS_SLAVE) return;
7106
7107 /* SYNC can't be issued when the server has pending data to send to
7108 * the client about already issued commands. We need a fresh reply
7109 * buffer registering the differences between the BGSAVE and the current
7110 * dataset, so that we can copy to other slaves if needed. */
7111 if (listLength(c->reply) != 0) {
7112 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7113 return;
7114 }
7115
7116 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7117 /* Here we need to check if there is a background saving operation
7118 * in progress, or if it is required to start one */
7119 if (server.bgsavechildpid != -1) {
7120 /* Ok a background save is in progress. Let's check if it is a good
7121 * one for replication, i.e. if there is another slave that is
7122 * registering differences since the server forked to save */
7123 redisClient *slave;
7124 listNode *ln;
7125 listIter li;
7126
7127 listRewind(server.slaves,&li);
7128 while((ln = listNext(&li))) {
7129 slave = ln->value;
7130 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7131 }
7132 if (ln) {
7133 /* Perfect, the server is already registering differences for
7134 * another slave. Set the right state, and copy the buffer. */
7135 listRelease(c->reply);
7136 c->reply = listDup(slave->reply);
7137 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7138 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7139 } else {
7140 /* No way, we need to wait for the next BGSAVE in order to
7141 * register differences */
7142 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7143 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7144 }
7145 } else {
7146 /* Ok we don't have a BGSAVE in progress, let's start one */
7147 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7148 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7149 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7150 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7151 return;
7152 }
7153 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7154 }
7155 c->repldbfd = -1;
7156 c->flags |= REDIS_SLAVE;
7157 c->slaveseldb = 0;
7158 listAddNodeTail(server.slaves,c);
7159 return;
7160 }
7161
7162 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7163 redisClient *slave = privdata;
7164 REDIS_NOTUSED(el);
7165 REDIS_NOTUSED(mask);
7166 char buf[REDIS_IOBUF_LEN];
7167 ssize_t nwritten, buflen;
7168
7169 if (slave->repldboff == 0) {
7170 /* Write the bulk write count before to transfer the DB. In theory here
7171 * we don't know how much room there is in the output buffer of the
7172 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7173 * operations) will never be smaller than the few bytes we need. */
7174 sds bulkcount;
7175
7176 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7177 slave->repldbsize);
7178 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7179 {
7180 sdsfree(bulkcount);
7181 freeClient(slave);
7182 return;
7183 }
7184 sdsfree(bulkcount);
7185 }
7186 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7187 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7188 if (buflen <= 0) {
7189 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7190 (buflen == 0) ? "premature EOF" : strerror(errno));
7191 freeClient(slave);
7192 return;
7193 }
7194 if ((nwritten = write(fd,buf,buflen)) == -1) {
7195 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7196 strerror(errno));
7197 freeClient(slave);
7198 return;
7199 }
7200 slave->repldboff += nwritten;
7201 if (slave->repldboff == slave->repldbsize) {
7202 close(slave->repldbfd);
7203 slave->repldbfd = -1;
7204 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7205 slave->replstate = REDIS_REPL_ONLINE;
7206 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7207 sendReplyToClient, slave) == AE_ERR) {
7208 freeClient(slave);
7209 return;
7210 }
7211 addReplySds(slave,sdsempty());
7212 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7213 }
7214 }
7215
7216 /* This function is called at the end of every backgrond saving.
7217 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7218 * otherwise REDIS_ERR is passed to the function.
7219 *
7220 * The goal of this function is to handle slaves waiting for a successful
7221 * background saving in order to perform non-blocking synchronization. */
7222 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7223 listNode *ln;
7224 int startbgsave = 0;
7225 listIter li;
7226
7227 listRewind(server.slaves,&li);
7228 while((ln = listNext(&li))) {
7229 redisClient *slave = ln->value;
7230
7231 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7232 startbgsave = 1;
7233 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7234 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7235 struct redis_stat buf;
7236
7237 if (bgsaveerr != REDIS_OK) {
7238 freeClient(slave);
7239 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7240 continue;
7241 }
7242 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7243 redis_fstat(slave->repldbfd,&buf) == -1) {
7244 freeClient(slave);
7245 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7246 continue;
7247 }
7248 slave->repldboff = 0;
7249 slave->repldbsize = buf.st_size;
7250 slave->replstate = REDIS_REPL_SEND_BULK;
7251 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7252 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7253 freeClient(slave);
7254 continue;
7255 }
7256 }
7257 }
7258 if (startbgsave) {
7259 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7260 listIter li;
7261
7262 listRewind(server.slaves,&li);
7263 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7264 while((ln = listNext(&li))) {
7265 redisClient *slave = ln->value;
7266
7267 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7268 freeClient(slave);
7269 }
7270 }
7271 }
7272 }
7273
7274 static int syncWithMaster(void) {
7275 char buf[1024], tmpfile[256], authcmd[1024];
7276 long dumpsize;
7277 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7278 int dfd, maxtries = 5;
7279
7280 if (fd == -1) {
7281 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7282 strerror(errno));
7283 return REDIS_ERR;
7284 }
7285
7286 /* AUTH with the master if required. */
7287 if(server.masterauth) {
7288 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7289 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7290 close(fd);
7291 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7292 strerror(errno));
7293 return REDIS_ERR;
7294 }
7295 /* Read the AUTH result. */
7296 if (syncReadLine(fd,buf,1024,3600) == -1) {
7297 close(fd);
7298 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7299 strerror(errno));
7300 return REDIS_ERR;
7301 }
7302 if (buf[0] != '+') {
7303 close(fd);
7304 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7305 return REDIS_ERR;
7306 }
7307 }
7308
7309 /* Issue the SYNC command */
7310 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7311 close(fd);
7312 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7313 strerror(errno));
7314 return REDIS_ERR;
7315 }
7316 /* Read the bulk write count */
7317 if (syncReadLine(fd,buf,1024,3600) == -1) {
7318 close(fd);
7319 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7320 strerror(errno));
7321 return REDIS_ERR;
7322 }
7323 if (buf[0] != '$') {
7324 close(fd);
7325 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7326 return REDIS_ERR;
7327 }
7328 dumpsize = strtol(buf+1,NULL,10);
7329 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7330 /* Read the bulk write data on a temp file */
7331 while(maxtries--) {
7332 snprintf(tmpfile,256,
7333 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7334 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7335 if (dfd != -1) break;
7336 sleep(1);
7337 }
7338 if (dfd == -1) {
7339 close(fd);
7340 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7341 return REDIS_ERR;
7342 }
7343 while(dumpsize) {
7344 int nread, nwritten;
7345
7346 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7347 if (nread == -1) {
7348 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7349 strerror(errno));
7350 close(fd);
7351 close(dfd);
7352 return REDIS_ERR;
7353 }
7354 nwritten = write(dfd,buf,nread);
7355 if (nwritten == -1) {
7356 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7357 close(fd);
7358 close(dfd);
7359 return REDIS_ERR;
7360 }
7361 dumpsize -= nread;
7362 }
7363 close(dfd);
7364 if (rename(tmpfile,server.dbfilename) == -1) {
7365 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7366 unlink(tmpfile);
7367 close(fd);
7368 return REDIS_ERR;
7369 }
7370 emptyDb();
7371 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7372 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7373 close(fd);
7374 return REDIS_ERR;
7375 }
7376 server.master = createClient(fd);
7377 server.master->flags |= REDIS_MASTER;
7378 server.master->authenticated = 1;
7379 server.replstate = REDIS_REPL_CONNECTED;
7380 return REDIS_OK;
7381 }
7382
7383 static void slaveofCommand(redisClient *c) {
7384 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7385 !strcasecmp(c->argv[2]->ptr,"one")) {
7386 if (server.masterhost) {
7387 sdsfree(server.masterhost);
7388 server.masterhost = NULL;
7389 if (server.master) freeClient(server.master);
7390 server.replstate = REDIS_REPL_NONE;
7391 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7392 }
7393 } else {
7394 sdsfree(server.masterhost);
7395 server.masterhost = sdsdup(c->argv[1]->ptr);
7396 server.masterport = atoi(c->argv[2]->ptr);
7397 if (server.master) freeClient(server.master);
7398 server.replstate = REDIS_REPL_CONNECT;
7399 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7400 server.masterhost, server.masterport);
7401 }
7402 addReply(c,shared.ok);
7403 }
7404
7405 /* ============================ Maxmemory directive ======================== */
7406
7407 /* Try to free one object form the pre-allocated objects free list.
7408 * This is useful under low mem conditions as by default we take 1 million
7409 * free objects allocated. On success REDIS_OK is returned, otherwise
7410 * REDIS_ERR. */
7411 static int tryFreeOneObjectFromFreelist(void) {
7412 robj *o;
7413
7414 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7415 if (listLength(server.objfreelist)) {
7416 listNode *head = listFirst(server.objfreelist);
7417 o = listNodeValue(head);
7418 listDelNode(server.objfreelist,head);
7419 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7420 zfree(o);
7421 return REDIS_OK;
7422 } else {
7423 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7424 return REDIS_ERR;
7425 }
7426 }
7427
7428 /* This function gets called when 'maxmemory' is set on the config file to limit
7429 * the max memory used by the server, and we are out of memory.
7430 * This function will try to, in order:
7431 *
7432 * - Free objects from the free list
7433 * - Try to remove keys with an EXPIRE set
7434 *
7435 * It is not possible to free enough memory to reach used-memory < maxmemory
7436 * the server will start refusing commands that will enlarge even more the
7437 * memory usage.
7438 */
7439 static void freeMemoryIfNeeded(void) {
7440 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7441 int j, k, freed = 0;
7442
7443 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7444 for (j = 0; j < server.dbnum; j++) {
7445 int minttl = -1;
7446 robj *minkey = NULL;
7447 struct dictEntry *de;
7448
7449 if (dictSize(server.db[j].expires)) {
7450 freed = 1;
7451 /* From a sample of three keys drop the one nearest to
7452 * the natural expire */
7453 for (k = 0; k < 3; k++) {
7454 time_t t;
7455
7456 de = dictGetRandomKey(server.db[j].expires);
7457 t = (time_t) dictGetEntryVal(de);
7458 if (minttl == -1 || t < minttl) {
7459 minkey = dictGetEntryKey(de);
7460 minttl = t;
7461 }
7462 }
7463 deleteKey(server.db+j,minkey);
7464 }
7465 }
7466 if (!freed) return; /* nothing to free... */
7467 }
7468 }
7469
7470 /* ============================== Append Only file ========================== */
7471
7472 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7473 sds buf = sdsempty();
7474 int j;
7475 ssize_t nwritten;
7476 time_t now;
7477 robj *tmpargv[3];
7478
7479 /* The DB this command was targetting is not the same as the last command
7480 * we appendend. To issue a SELECT command is needed. */
7481 if (dictid != server.appendseldb) {
7482 char seldb[64];
7483
7484 snprintf(seldb,sizeof(seldb),"%d",dictid);
7485 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7486 (unsigned long)strlen(seldb),seldb);
7487 server.appendseldb = dictid;
7488 }
7489
7490 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7491 * EXPIREs into EXPIREATs calls */
7492 if (cmd->proc == expireCommand) {
7493 long when;
7494
7495 tmpargv[0] = createStringObject("EXPIREAT",8);
7496 tmpargv[1] = argv[1];
7497 incrRefCount(argv[1]);
7498 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7499 tmpargv[2] = createObject(REDIS_STRING,
7500 sdscatprintf(sdsempty(),"%ld",when));
7501 argv = tmpargv;
7502 }
7503
7504 /* Append the actual command */
7505 buf = sdscatprintf(buf,"*%d\r\n",argc);
7506 for (j = 0; j < argc; j++) {
7507 robj *o = argv[j];
7508
7509 o = getDecodedObject(o);
7510 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7511 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7512 buf = sdscatlen(buf,"\r\n",2);
7513 decrRefCount(o);
7514 }
7515
7516 /* Free the objects from the modified argv for EXPIREAT */
7517 if (cmd->proc == expireCommand) {
7518 for (j = 0; j < 3; j++)
7519 decrRefCount(argv[j]);
7520 }
7521
7522 /* We want to perform a single write. This should be guaranteed atomic
7523 * at least if the filesystem we are writing is a real physical one.
7524 * While this will save us against the server being killed I don't think
7525 * there is much to do about the whole server stopping for power problems
7526 * or alike */
7527 nwritten = write(server.appendfd,buf,sdslen(buf));
7528 if (nwritten != (signed)sdslen(buf)) {
7529 /* Ooops, we are in troubles. The best thing to do for now is
7530 * to simply exit instead to give the illusion that everything is
7531 * working as expected. */
7532 if (nwritten == -1) {
7533 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7534 } else {
7535 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7536 }
7537 exit(1);
7538 }
7539 /* If a background append only file rewriting is in progress we want to
7540 * accumulate the differences between the child DB and the current one
7541 * in a buffer, so that when the child process will do its work we
7542 * can append the differences to the new append only file. */
7543 if (server.bgrewritechildpid != -1)
7544 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7545
7546 sdsfree(buf);
7547 now = time(NULL);
7548 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7549 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7550 now-server.lastfsync > 1))
7551 {
7552 fsync(server.appendfd); /* Let's try to get this data on the disk */
7553 server.lastfsync = now;
7554 }
7555 }
7556
7557 /* In Redis commands are always executed in the context of a client, so in
7558 * order to load the append only file we need to create a fake client. */
7559 static struct redisClient *createFakeClient(void) {
7560 struct redisClient *c = zmalloc(sizeof(*c));
7561
7562 selectDb(c,0);
7563 c->fd = -1;
7564 c->querybuf = sdsempty();
7565 c->argc = 0;
7566 c->argv = NULL;
7567 c->flags = 0;
7568 /* We set the fake client as a slave waiting for the synchronization
7569 * so that Redis will not try to send replies to this client. */
7570 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7571 c->reply = listCreate();
7572 listSetFreeMethod(c->reply,decrRefCount);
7573 listSetDupMethod(c->reply,dupClientReplyValue);
7574 return c;
7575 }
7576
7577 static void freeFakeClient(struct redisClient *c) {
7578 sdsfree(c->querybuf);
7579 listRelease(c->reply);
7580 zfree(c);
7581 }
7582
7583 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7584 * error (the append only file is zero-length) REDIS_ERR is returned. On
7585 * fatal error an error message is logged and the program exists. */
7586 int loadAppendOnlyFile(char *filename) {
7587 struct redisClient *fakeClient;
7588 FILE *fp = fopen(filename,"r");
7589 struct redis_stat sb;
7590 unsigned long long loadedkeys = 0;
7591
7592 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7593 return REDIS_ERR;
7594
7595 if (fp == NULL) {
7596 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7597 exit(1);
7598 }
7599
7600 fakeClient = createFakeClient();
7601 while(1) {
7602 int argc, j;
7603 unsigned long len;
7604 robj **argv;
7605 char buf[128];
7606 sds argsds;
7607 struct redisCommand *cmd;
7608
7609 if (fgets(buf,sizeof(buf),fp) == NULL) {
7610 if (feof(fp))
7611 break;
7612 else
7613 goto readerr;
7614 }
7615 if (buf[0] != '*') goto fmterr;
7616 argc = atoi(buf+1);
7617 argv = zmalloc(sizeof(robj*)*argc);
7618 for (j = 0; j < argc; j++) {
7619 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7620 if (buf[0] != '$') goto fmterr;
7621 len = strtol(buf+1,NULL,10);
7622 argsds = sdsnewlen(NULL,len);
7623 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7624 argv[j] = createObject(REDIS_STRING,argsds);
7625 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7626 }
7627
7628 /* Command lookup */
7629 cmd = lookupCommand(argv[0]->ptr);
7630 if (!cmd) {
7631 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7632 exit(1);
7633 }
7634 /* Try object sharing and encoding */
7635 if (server.shareobjects) {
7636 int j;
7637 for(j = 1; j < argc; j++)
7638 argv[j] = tryObjectSharing(argv[j]);
7639 }
7640 if (cmd->flags & REDIS_CMD_BULK)
7641 tryObjectEncoding(argv[argc-1]);
7642 /* Run the command in the context of a fake client */
7643 fakeClient->argc = argc;
7644 fakeClient->argv = argv;
7645 cmd->proc(fakeClient);
7646 /* Discard the reply objects list from the fake client */
7647 while(listLength(fakeClient->reply))
7648 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7649 /* Clean up, ready for the next command */
7650 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7651 zfree(argv);
7652 /* Handle swapping while loading big datasets when VM is on */
7653 loadedkeys++;
7654 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7655 while (zmalloc_used_memory() > server.vm_max_memory) {
7656 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7657 }
7658 }
7659 }
7660 fclose(fp);
7661 freeFakeClient(fakeClient);
7662 return REDIS_OK;
7663
7664 readerr:
7665 if (feof(fp)) {
7666 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7667 } else {
7668 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7669 }
7670 exit(1);
7671 fmterr:
7672 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7673 exit(1);
7674 }
7675
7676 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7677 static int fwriteBulkObject(FILE *fp, robj *obj) {
7678 char buf[128];
7679 int decrrc = 0;
7680
7681 /* Avoid the incr/decr ref count business if possible to help
7682 * copy-on-write (we are often in a child process when this function
7683 * is called).
7684 * Also makes sure that key objects don't get incrRefCount-ed when VM
7685 * is enabled */
7686 if (obj->encoding != REDIS_ENCODING_RAW) {
7687 obj = getDecodedObject(obj);
7688 decrrc = 1;
7689 }
7690 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7691 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7692 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7693 goto err;
7694 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7695 if (decrrc) decrRefCount(obj);
7696 return 1;
7697 err:
7698 if (decrrc) decrRefCount(obj);
7699 return 0;
7700 }
7701
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n.
 * Returns 1 on success, 0 if any of the writes fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* An empty payload is valid, but fwrite() would report 0 items. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7713
/* Write a double into a file in the bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the "%.17g" representation of 'd'.
 * Returns 1 on success, 0 if any of the fwrite() calls fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload buffer already carries its trailing CRLF, so the count
     * announced in the header is two bytes shorter than what we write. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)(payloadlen-2));

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,payloadlen,1,fp) != 0;
}
7724
/* Write a long into a file in the bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the decimal ("%ld") representation of 'l'.
 * Returns 1 on success, 0 if any of the fwrite() calls fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t payloadlen;

    /* The payload buffer already carries its trailing CRLF, so the count
     * announced in the header is two bytes shorter than what we write. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payloadlen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)(payloadlen-2));

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,payloadlen,1,fp) != 0;
}
7735
7736 /* Write a sequence of commands able to fully rebuild the dataset into
7737 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7738 static int rewriteAppendOnlyFile(char *filename) {
7739 dictIterator *di = NULL;
7740 dictEntry *de;
7741 FILE *fp;
7742 char tmpfile[256];
7743 int j;
7744 time_t now = time(NULL);
7745
7746 /* Note that we have to use a different temp name here compared to the
7747 * one used by rewriteAppendOnlyFileBackground() function. */
7748 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7749 fp = fopen(tmpfile,"w");
7750 if (!fp) {
7751 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7752 return REDIS_ERR;
7753 }
7754 for (j = 0; j < server.dbnum; j++) {
7755 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7756 redisDb *db = server.db+j;
7757 dict *d = db->dict;
7758 if (dictSize(d) == 0) continue;
7759 di = dictGetIterator(d);
7760 if (!di) {
7761 fclose(fp);
7762 return REDIS_ERR;
7763 }
7764
7765 /* SELECT the new DB */
7766 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7767 if (fwriteBulkLong(fp,j) == 0) goto werr;
7768
7769 /* Iterate this DB writing every entry */
7770 while((de = dictNext(di)) != NULL) {
7771 robj *key, *o;
7772 time_t expiretime;
7773 int swapped;
7774
7775 key = dictGetEntryKey(de);
7776 /* If the value for this key is swapped, load a preview in memory.
7777 * We use a "swapped" flag to remember if we need to free the
7778 * value object instead to just increment the ref count anyway
7779 * in order to avoid copy-on-write of pages if we are forked() */
7780 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7781 key->storage == REDIS_VM_SWAPPING) {
7782 o = dictGetEntryVal(de);
7783 swapped = 0;
7784 } else {
7785 o = vmPreviewObject(key);
7786 swapped = 1;
7787 }
7788 expiretime = getExpire(db,key);
7789
7790 /* Save the key and associated value */
7791 if (o->type == REDIS_STRING) {
7792 /* Emit a SET command */
7793 char cmd[]="*3\r\n$3\r\nSET\r\n";
7794 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7795 /* Key and value */
7796 if (fwriteBulkObject(fp,key) == 0) goto werr;
7797 if (fwriteBulkObject(fp,o) == 0) goto werr;
7798 } else if (o->type == REDIS_LIST) {
7799 /* Emit the RPUSHes needed to rebuild the list */
7800 list *list = o->ptr;
7801 listNode *ln;
7802 listIter li;
7803
7804 listRewind(list,&li);
7805 while((ln = listNext(&li))) {
7806 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7807 robj *eleobj = listNodeValue(ln);
7808
7809 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7810 if (fwriteBulkObject(fp,key) == 0) goto werr;
7811 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7812 }
7813 } else if (o->type == REDIS_SET) {
7814 /* Emit the SADDs needed to rebuild the set */
7815 dict *set = o->ptr;
7816 dictIterator *di = dictGetIterator(set);
7817 dictEntry *de;
7818
7819 while((de = dictNext(di)) != NULL) {
7820 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7821 robj *eleobj = dictGetEntryKey(de);
7822
7823 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7824 if (fwriteBulkObject(fp,key) == 0) goto werr;
7825 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7826 }
7827 dictReleaseIterator(di);
7828 } else if (o->type == REDIS_ZSET) {
7829 /* Emit the ZADDs needed to rebuild the sorted set */
7830 zset *zs = o->ptr;
7831 dictIterator *di = dictGetIterator(zs->dict);
7832 dictEntry *de;
7833
7834 while((de = dictNext(di)) != NULL) {
7835 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7836 robj *eleobj = dictGetEntryKey(de);
7837 double *score = dictGetEntryVal(de);
7838
7839 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7840 if (fwriteBulkObject(fp,key) == 0) goto werr;
7841 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7842 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7843 }
7844 dictReleaseIterator(di);
7845 } else if (o->type == REDIS_HASH) {
7846 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7847
7848 /* Emit the HSETs needed to rebuild the hash */
7849 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7850 unsigned char *p = zipmapRewind(o->ptr);
7851 unsigned char *field, *val;
7852 unsigned int flen, vlen;
7853
7854 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7855 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7856 if (fwriteBulkObject(fp,key) == 0) goto werr;
7857 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7858 return -1;
7859 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7860 return -1;
7861 }
7862 } else {
7863 dictIterator *di = dictGetIterator(o->ptr);
7864 dictEntry *de;
7865
7866 while((de = dictNext(di)) != NULL) {
7867 robj *field = dictGetEntryKey(de);
7868 robj *val = dictGetEntryVal(de);
7869
7870 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7871 if (fwriteBulkObject(fp,key) == 0) goto werr;
7872 if (fwriteBulkObject(fp,field) == -1) return -1;
7873 if (fwriteBulkObject(fp,val) == -1) return -1;
7874 }
7875 dictReleaseIterator(di);
7876 }
7877 } else {
7878 redisAssert(0);
7879 }
7880 /* Save the expire time */
7881 if (expiretime != -1) {
7882 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7883 /* If this key is already expired skip it */
7884 if (expiretime < now) continue;
7885 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7886 if (fwriteBulkObject(fp,key) == 0) goto werr;
7887 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7888 }
7889 if (swapped) decrRefCount(o);
7890 }
7891 dictReleaseIterator(di);
7892 }
7893
7894 /* Make sure data will not remain on the OS's output buffers */
7895 fflush(fp);
7896 fsync(fileno(fp));
7897 fclose(fp);
7898
7899 /* Use RENAME to make sure the DB file is changed atomically only
7900 * if the generate DB file is ok. */
7901 if (rename(tmpfile,filename) == -1) {
7902 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7903 unlink(tmpfile);
7904 return REDIS_ERR;
7905 }
7906 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7907 return REDIS_OK;
7908
7909 werr:
7910 fclose(fp);
7911 unlink(tmpfile);
7912 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7913 if (di) dictReleaseIterator(di);
7914 return REDIS_ERR;
7915 }
7916
7917 /* This is how rewriting of the append only file in background works:
7918 *
7919 * 1) The user calls BGREWRITEAOF
7920 * 2) Redis calls this function, that forks():
7921 * 2a) the child rewrite the append only file in a temp file.
7922 * 2b) the parent accumulates differences in server.bgrewritebuf.
7923 * 3) When the child finished '2a' exists.
7924 * 4) The parent will trap the exit code, if it's OK, will append the
7925 * data accumulated into server.bgrewritebuf into the temp file, and
7926 * finally will rename(2) the temp file in the actual file name.
7927 * The the new file is reopened as the new append only file. Profit!
7928 */
7929 static int rewriteAppendOnlyFileBackground(void) {
7930 pid_t childpid;
7931
7932 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7933 if (server.vm_enabled) waitEmptyIOJobsQueue();
7934 if ((childpid = fork()) == 0) {
7935 /* Child */
7936 char tmpfile[256];
7937
7938 if (server.vm_enabled) vmReopenSwapFile();
7939 close(server.fd);
7940 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7941 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7942 _exit(0);
7943 } else {
7944 _exit(1);
7945 }
7946 } else {
7947 /* Parent */
7948 if (childpid == -1) {
7949 redisLog(REDIS_WARNING,
7950 "Can't rewrite append only file in background: fork: %s",
7951 strerror(errno));
7952 return REDIS_ERR;
7953 }
7954 redisLog(REDIS_NOTICE,
7955 "Background append only file rewriting started by pid %d",childpid);
7956 server.bgrewritechildpid = childpid;
7957 /* We set appendseldb to -1 in order to force the next call to the
7958 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7959 * accumulated by the parent into server.bgrewritebuf will start
7960 * with a SELECT statement and it will be safe to merge. */
7961 server.appendseldb = -1;
7962 return REDIS_OK;
7963 }
7964 return REDIS_OK; /* unreached */
7965 }
7966
7967 static void bgrewriteaofCommand(redisClient *c) {
7968 if (server.bgrewritechildpid != -1) {
7969 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7970 return;
7971 }
7972 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7973 char *status = "+Background append only file rewriting started\r\n";
7974 addReplySds(c,sdsnew(status));
7975 } else {
7976 addReply(c,shared.err);
7977 }
7978 }
7979
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * directory. The result of unlink() is ignored, so calling this function
 * when the file does not exist is harmless. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
7986
7987 /* Virtual Memory is composed mainly of two subsystems:
7988 * - Blocking Virtual Memory
7989 * - Threaded Virtual Memory I/O
7990 * The two parts are not fully decoupled, but functions are split among two
7991 * different sections of the source code (delimited by comments) in order to
7992 * make more clear what functionality is about the blocking VM and what about
7993 * the threaded (not blocking) VM.
7994 *
7995 * Redis VM design:
7996 *
7997 * Redis VM is a blocking VM (one that blocks reading swapped values from
7998 * disk into memory when a value swapped out is needed in memory) that is made
7999 * unblocking by trying to examine the command argument vector in order to
8000 * load in background values that will likely be needed in order to exec
8001 * the command. The command is executed only once all the relevant keys
8002 * are loaded into memory.
8003 *
8004 * This is basically almost as simple as a blocking VM, but almost as parallel
8005 * as a fully non-blocking VM.
8006 */
8007
8008 /* =================== Virtual Memory - Blocking Side ====================== */
8009
8010 /* substitute the first occurrence of '%p' with the process pid in the
8011 * swap file name. */
8012 static void expandVmSwapFilename(void) {
8013 char *p = strstr(server.vm_swap_file,"%p");
8014 sds new;
8015
8016 if (!p) return;
8017 new = sdsempty();
8018 *p = '\0';
8019 new = sdscat(new,server.vm_swap_file);
8020 new = sdscatprintf(new,"%ld",(long) getpid());
8021 new = sdscat(new,p+2);
8022 zfree(server.vm_swap_file);
8023 server.vm_swap_file = new;
8024 }
8025
8026 static void vmInit(void) {
8027 off_t totsize;
8028 int pipefds[2];
8029 size_t stacksize;
8030
8031 if (server.vm_max_threads != 0)
8032 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8033
8034 expandVmSwapFilename();
8035 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8036 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8037 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8038 }
8039 if (server.vm_fp == NULL) {
8040 redisLog(REDIS_WARNING,
8041 "Impossible to open the swap file: %s. Exiting.",
8042 strerror(errno));
8043 exit(1);
8044 }
8045 server.vm_fd = fileno(server.vm_fp);
8046 server.vm_next_page = 0;
8047 server.vm_near_pages = 0;
8048 server.vm_stats_used_pages = 0;
8049 server.vm_stats_swapped_objects = 0;
8050 server.vm_stats_swapouts = 0;
8051 server.vm_stats_swapins = 0;
8052 totsize = server.vm_pages*server.vm_page_size;
8053 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8054 if (ftruncate(server.vm_fd,totsize) == -1) {
8055 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8056 strerror(errno));
8057 exit(1);
8058 } else {
8059 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8060 }
8061 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8062 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8063 (long long) (server.vm_pages+7)/8, server.vm_pages);
8064 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8065
8066 /* Initialize threaded I/O (used by Virtual Memory) */
8067 server.io_newjobs = listCreate();
8068 server.io_processing = listCreate();
8069 server.io_processed = listCreate();
8070 server.io_ready_clients = listCreate();
8071 pthread_mutex_init(&server.io_mutex,NULL);
8072 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8073 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8074 server.io_active_threads = 0;
8075 if (pipe(pipefds) == -1) {
8076 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8077 ,strerror(errno));
8078 exit(1);
8079 }
8080 server.io_ready_pipe_read = pipefds[0];
8081 server.io_ready_pipe_write = pipefds[1];
8082 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8083 /* LZF requires a lot of stack */
8084 pthread_attr_init(&server.io_threads_attr);
8085 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8086 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8087 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8088 /* Listen for events in the threaded I/O pipe */
8089 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8090 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8091 oom("creating file event");
8092 }
8093
8094 /* Mark the page as used */
8095 static void vmMarkPageUsed(off_t page) {
8096 off_t byte = page/8;
8097 int bit = page&7;
8098 redisAssert(vmFreePage(page) == 1);
8099 server.vm_bitmap[byte] |= 1<<bit;
8100 }
8101
8102 /* Mark N contiguous pages as used, with 'page' being the first. */
8103 static void vmMarkPagesUsed(off_t page, off_t count) {
8104 off_t j;
8105
8106 for (j = 0; j < count; j++)
8107 vmMarkPageUsed(page+j);
8108 server.vm_stats_used_pages += count;
8109 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8110 (long long)count, (long long)page);
8111 }
8112
8113 /* Mark the page as free */
8114 static void vmMarkPageFree(off_t page) {
8115 off_t byte = page/8;
8116 int bit = page&7;
8117 redisAssert(vmFreePage(page) == 0);
8118 server.vm_bitmap[byte] &= ~(1<<bit);
8119 }
8120
8121 /* Mark N contiguous pages as free, with 'page' being the first. */
8122 static void vmMarkPagesFree(off_t page, off_t count) {
8123 off_t j;
8124
8125 for (j = 0; j < count; j++)
8126 vmMarkPageFree(page+j);
8127 server.vm_stats_used_pages -= count;
8128 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8129 (long long)count, (long long)page);
8130 }
8131
8132 /* Test if the page is free */
8133 static int vmFreePage(off_t page) {
8134 off_t byte = page/8;
8135 int bit = page&7;
8136 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8137 }
8138
8139 /* Find N contiguous free pages storing the first page of the cluster in *first.
8140 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8141 * REDIS_ERR is returned.
8142 *
8143 * This function uses a simple algorithm: we try to allocate
8144 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8145 * again from the start of the swap file searching for free spaces.
8146 *
8147 * If it looks pretty clear that there are no free pages near our offset
8148 * we try to find less populated places doing a forward jump of
8149 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8150 * without hurry, and then we jump again and so forth...
8151 *
8152 * This function can be improved using a free list to avoid to guess
8153 * too much, since we could collect data about freed pages.
8154 *
8155 * note: I implemented this function just after watching an episode of
8156 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8157 */
8158 static int vmFindContiguousPages(off_t *first, off_t n) {
8159 off_t base, offset = 0, since_jump = 0, numfree = 0;
8160
8161 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8162 server.vm_near_pages = 0;
8163 server.vm_next_page = 0;
8164 }
8165 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8166 base = server.vm_next_page;
8167
8168 while(offset < server.vm_pages) {
8169 off_t this = base+offset;
8170
8171 /* If we overflow, restart from page zero */
8172 if (this >= server.vm_pages) {
8173 this -= server.vm_pages;
8174 if (this == 0) {
8175 /* Just overflowed, what we found on tail is no longer
8176 * interesting, as it's no longer contiguous. */
8177 numfree = 0;
8178 }
8179 }
8180 if (vmFreePage(this)) {
8181 /* This is a free page */
8182 numfree++;
8183 /* Already got N free pages? Return to the caller, with success */
8184 if (numfree == n) {
8185 *first = this-(n-1);
8186 server.vm_next_page = this+1;
8187 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8188 return REDIS_OK;
8189 }
8190 } else {
8191 /* The current one is not a free page */
8192 numfree = 0;
8193 }
8194
8195 /* Fast-forward if the current page is not free and we already
8196 * searched enough near this place. */
8197 since_jump++;
8198 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8199 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8200 since_jump = 0;
8201 /* Note that even if we rewind after the jump, we are don't need
8202 * to make sure numfree is set to zero as we only jump *if* it
8203 * is set to zero. */
8204 } else {
8205 /* Otherwise just check the next page */
8206 offset++;
8207 }
8208 }
8209 return REDIS_ERR;
8210 }
8211
8212 /* Write the specified object at the specified page of the swap file */
8213 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8214 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8215 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8216 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8217 redisLog(REDIS_WARNING,
8218 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8219 strerror(errno));
8220 return REDIS_ERR;
8221 }
8222 rdbSaveObject(server.vm_fp,o);
8223 fflush(server.vm_fp);
8224 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8225 return REDIS_OK;
8226 }
8227
8228 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8229 * needed to later retrieve the object into the key object.
8230 * If we can't find enough contiguous empty pages to swap the object on disk
8231 * REDIS_ERR is returned. */
8232 static int vmSwapObjectBlocking(robj *key, robj *val) {
8233 off_t pages = rdbSavedObjectPages(val,NULL);
8234 off_t page;
8235
8236 assert(key->storage == REDIS_VM_MEMORY);
8237 assert(key->refcount == 1);
8238 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8239 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8240 key->vm.page = page;
8241 key->vm.usedpages = pages;
8242 key->storage = REDIS_VM_SWAPPED;
8243 key->vtype = val->type;
8244 decrRefCount(val); /* Deallocate the object from memory. */
8245 vmMarkPagesUsed(page,pages);
8246 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8247 (unsigned char*) key->ptr,
8248 (unsigned long long) page, (unsigned long long) pages);
8249 server.vm_stats_swapped_objects++;
8250 server.vm_stats_swapouts++;
8251 return REDIS_OK;
8252 }
8253
8254 static robj *vmReadObjectFromSwap(off_t page, int type) {
8255 robj *o;
8256
8257 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8258 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8259 redisLog(REDIS_WARNING,
8260 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8261 strerror(errno));
8262 _exit(1);
8263 }
8264 o = rdbLoadObject(type,server.vm_fp);
8265 if (o == NULL) {
8266 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8267 _exit(1);
8268 }
8269 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8270 return o;
8271 }
8272
8273 /* Load the value object relative to the 'key' object from swap to memory.
8274 * The newly allocated object is returned.
8275 *
8276 * If preview is true the unserialized object is returned to the caller but
8277 * no changes are made to the key object, nor the pages are marked as freed */
8278 static robj *vmGenericLoadObject(robj *key, int preview) {
8279 robj *val;
8280
8281 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8282 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8283 if (!preview) {
8284 key->storage = REDIS_VM_MEMORY;
8285 key->vm.atime = server.unixtime;
8286 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8287 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8288 (unsigned char*) key->ptr);
8289 server.vm_stats_swapped_objects--;
8290 } else {
8291 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8292 (unsigned char*) key->ptr);
8293 }
8294 server.vm_stats_swapins++;
8295 return val;
8296 }
8297
8298 /* Plain object loading, from swap to memory */
8299 static robj *vmLoadObject(robj *key) {
8300 /* If we are loading the object in background, stop it, we
8301 * need to load this object synchronously ASAP. */
8302 if (key->storage == REDIS_VM_LOADING)
8303 vmCancelThreadedIOJob(key);
8304 return vmGenericLoadObject(key,0);
8305 }
8306
8307 /* Just load the value on disk, without to modify the key.
8308 * This is useful when we want to perform some operation on the value
8309 * without to really bring it from swap to memory, like while saving the
8310 * dataset or rewriting the append only log. */
8311 static robj *vmPreviewObject(robj *key) {
8312 return vmGenericLoadObject(key,1);
8313 }
8314
8315 /* How a good candidate is this object for swapping?
8316 * The better candidate it is, the greater the returned value.
8317 *
8318 * Currently we try to perform a fast estimation of the object size in
8319 * memory, and combine it with aging informations.
8320 *
8321 * Basically swappability = idle-time * log(estimated size)
8322 *
8323 * Bigger objects are preferred over smaller objects, but not
8324 * proportionally, this is why we use the logarithm. This algorithm is
8325 * just a first try and will probably be tuned later. */
8326 static double computeObjectSwappability(robj *o) {
8327 time_t age = server.unixtime - o->vm.atime;
8328 long asize = 0;
8329 list *l;
8330 dict *d;
8331 struct dictEntry *de;
8332 int z;
8333
8334 if (age <= 0) return 0;
8335 switch(o->type) {
8336 case REDIS_STRING:
8337 if (o->encoding != REDIS_ENCODING_RAW) {
8338 asize = sizeof(*o);
8339 } else {
8340 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8341 }
8342 break;
8343 case REDIS_LIST:
8344 l = o->ptr;
8345 listNode *ln = listFirst(l);
8346
8347 asize = sizeof(list);
8348 if (ln) {
8349 robj *ele = ln->value;
8350 long elesize;
8351
8352 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8353 (sizeof(*o)+sdslen(ele->ptr)) :
8354 sizeof(*o);
8355 asize += (sizeof(listNode)+elesize)*listLength(l);
8356 }
8357 break;
8358 case REDIS_SET:
8359 case REDIS_ZSET:
8360 z = (o->type == REDIS_ZSET);
8361 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8362
8363 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8364 if (z) asize += sizeof(zset)-sizeof(dict);
8365 if (dictSize(d)) {
8366 long elesize;
8367 robj *ele;
8368
8369 de = dictGetRandomKey(d);
8370 ele = dictGetEntryKey(de);
8371 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8372 (sizeof(*o)+sdslen(ele->ptr)) :
8373 sizeof(*o);
8374 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8375 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8376 }
8377 break;
8378 case REDIS_HASH:
8379 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8380 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8381 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8382 unsigned int klen, vlen;
8383 unsigned char *key, *val;
8384
8385 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8386 klen = 0;
8387 vlen = 0;
8388 }
8389 asize = len*(klen+vlen+3);
8390 } else if (o->encoding == REDIS_ENCODING_HT) {
8391 d = o->ptr;
8392 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8393 if (dictSize(d)) {
8394 long elesize;
8395 robj *ele;
8396
8397 de = dictGetRandomKey(d);
8398 ele = dictGetEntryKey(de);
8399 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8400 (sizeof(*o)+sdslen(ele->ptr)) :
8401 sizeof(*o);
8402 ele = dictGetEntryVal(de);
8403 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8404 (sizeof(*o)+sdslen(ele->ptr)) :
8405 sizeof(*o);
8406 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8407 }
8408 }
8409 break;
8410 }
8411 return (double)age*log(1+asize);
8412 }
8413
8414 /* Try to swap an object that's a good candidate for swapping.
8415 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8416 * to swap any object at all.
8417 *
8418 * If 'usethreaded' is true, Redis will try to swap the object in background
8419 * using I/O threads. */
8420 static int vmSwapOneObject(int usethreads) {
8421 int j, i;
8422 struct dictEntry *best = NULL;
8423 double best_swappability = 0;
8424 redisDb *best_db = NULL;
8425 robj *key, *val;
8426
8427 for (j = 0; j < server.dbnum; j++) {
8428 redisDb *db = server.db+j;
8429 /* Why maxtries is set to 100?
8430 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8431 * are swappable objects */
8432 int maxtries = 100;
8433
8434 if (dictSize(db->dict) == 0) continue;
8435 for (i = 0; i < 5; i++) {
8436 dictEntry *de;
8437 double swappability;
8438
8439 if (maxtries) maxtries--;
8440 de = dictGetRandomKey(db->dict);
8441 key = dictGetEntryKey(de);
8442 val = dictGetEntryVal(de);
8443 /* Only swap objects that are currently in memory.
8444 *
8445 * Also don't swap shared objects if threaded VM is on, as we
8446 * try to ensure that the main thread does not touch the
8447 * object while the I/O thread is using it, but we can't
8448 * control other keys without adding additional mutex. */
8449 if (key->storage != REDIS_VM_MEMORY ||
8450 (server.vm_max_threads != 0 && val->refcount != 1)) {
8451 if (maxtries) i--; /* don't count this try */
8452 continue;
8453 }
8454 swappability = computeObjectSwappability(val);
8455 if (!best || swappability > best_swappability) {
8456 best = de;
8457 best_swappability = swappability;
8458 best_db = db;
8459 }
8460 }
8461 }
8462 if (best == NULL) return REDIS_ERR;
8463 key = dictGetEntryKey(best);
8464 val = dictGetEntryVal(best);
8465
8466 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8467 key->ptr, best_swappability);
8468
8469 /* Unshare the key if needed */
8470 if (key->refcount > 1) {
8471 robj *newkey = dupStringObject(key);
8472 decrRefCount(key);
8473 key = dictGetEntryKey(best) = newkey;
8474 }
8475 /* Swap it */
8476 if (usethreads) {
8477 vmSwapObjectThreaded(key,val,best_db);
8478 return REDIS_OK;
8479 } else {
8480 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8481 dictGetEntryVal(best) = NULL;
8482 return REDIS_OK;
8483 } else {
8484 return REDIS_ERR;
8485 }
8486 }
8487 }
8488
/* Swap out one object synchronously: shortcut for vmSwapOneObject(0),
 * whose return value is passed through. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8492
/* Swap out one object using the threaded path: shortcut for
 * vmSwapOneObject(1), whose return value is passed through. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8496
8497 /* Return true if it's safe to swap out objects in a given moment.
8498 * Basically we don't want to swap objects out while there is a BGSAVE
8499 * or a BGAEOREWRITE running in backgroud. */
8500 static int vmCanSwapOut(void) {
8501 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8502 }
8503
8504 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8505 * and was deleted. Otherwise 0 is returned. */
8506 static int deleteIfSwapped(redisDb *db, robj *key) {
8507 dictEntry *de;
8508 robj *foundkey;
8509
8510 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8511 foundkey = dictGetEntryKey(de);
8512 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8513 deleteKey(db,key);
8514 return 1;
8515 }
8516
8517 /* =================== Virtual Memory - Threaded I/O ======================= */
8518
8519 static void freeIOJob(iojob *j) {
8520 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8521 j->type == REDIS_IOJOB_DO_SWAP ||
8522 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8523 decrRefCount(j->val);
8524 decrRefCount(j->key);
8525 zfree(j);
8526 }
8527
8528 /* Every time a thread finished a Job, it writes a byte into the write side
8529 * of an unix pipe in order to "awake" the main thread, and this function
8530 * is called. */
8531 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8532 int mask)
8533 {
8534 char buf[1];
8535 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8536 REDIS_NOTUSED(el);
8537 REDIS_NOTUSED(mask);
8538 REDIS_NOTUSED(privdata);
8539
8540 /* For every byte we read in the read side of the pipe, there is one
8541 * I/O job completed to process. */
8542 while((retval = read(fd,buf,1)) == 1) {
8543 iojob *j;
8544 listNode *ln;
8545 robj *key;
8546 struct dictEntry *de;
8547
8548 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8549
8550 /* Get the processed element (the oldest one) */
8551 lockThreadedIO();
8552 assert(listLength(server.io_processed) != 0);
8553 if (toprocess == -1) {
8554 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8555 if (toprocess <= 0) toprocess = 1;
8556 }
8557 ln = listFirst(server.io_processed);
8558 j = ln->value;
8559 listDelNode(server.io_processed,ln);
8560 unlockThreadedIO();
8561 /* If this job is marked as canceled, just ignore it */
8562 if (j->canceled) {
8563 freeIOJob(j);
8564 continue;
8565 }
8566 /* Post process it in the main thread, as there are things we
8567 * can do just here to avoid race conditions and/or invasive locks */
8568 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8569 de = dictFind(j->db->dict,j->key);
8570 assert(de != NULL);
8571 key = dictGetEntryKey(de);
8572 if (j->type == REDIS_IOJOB_LOAD) {
8573 redisDb *db;
8574
8575 /* Key loaded, bring it at home */
8576 key->storage = REDIS_VM_MEMORY;
8577 key->vm.atime = server.unixtime;
8578 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8579 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8580 (unsigned char*) key->ptr);
8581 server.vm_stats_swapped_objects--;
8582 server.vm_stats_swapins++;
8583 dictGetEntryVal(de) = j->val;
8584 incrRefCount(j->val);
8585 db = j->db;
8586 freeIOJob(j);
8587 /* Handle clients waiting for this key to be loaded. */
8588 handleClientsBlockedOnSwappedKey(db,key);
8589 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8590 /* Now we know the amount of pages required to swap this object.
8591 * Let's find some space for it, and queue this task again
8592 * rebranded as REDIS_IOJOB_DO_SWAP. */
8593 if (!vmCanSwapOut() ||
8594 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8595 {
8596 /* Ooops... no space or we can't swap as there is
8597 * a fork()ed Redis trying to save stuff on disk. */
8598 freeIOJob(j);
8599 key->storage = REDIS_VM_MEMORY; /* undo operation */
8600 } else {
8601 /* Note that we need to mark this pages as used now,
8602 * if the job will be canceled, we'll mark them as freed
8603 * again. */
8604 vmMarkPagesUsed(j->page,j->pages);
8605 j->type = REDIS_IOJOB_DO_SWAP;
8606 lockThreadedIO();
8607 queueIOJob(j);
8608 unlockThreadedIO();
8609 }
8610 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8611 robj *val;
8612
8613 /* Key swapped. We can finally free some memory. */
8614 if (key->storage != REDIS_VM_SWAPPING) {
8615 printf("key->storage: %d\n",key->storage);
8616 printf("key->name: %s\n",(char*)key->ptr);
8617 printf("key->refcount: %d\n",key->refcount);
8618 printf("val: %p\n",(void*)j->val);
8619 printf("val->type: %d\n",j->val->type);
8620 printf("val->ptr: %s\n",(char*)j->val->ptr);
8621 }
8622 redisAssert(key->storage == REDIS_VM_SWAPPING);
8623 val = dictGetEntryVal(de);
8624 key->vm.page = j->page;
8625 key->vm.usedpages = j->pages;
8626 key->storage = REDIS_VM_SWAPPED;
8627 key->vtype = j->val->type;
8628 decrRefCount(val); /* Deallocate the object from memory. */
8629 dictGetEntryVal(de) = NULL;
8630 redisLog(REDIS_DEBUG,
8631 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8632 (unsigned char*) key->ptr,
8633 (unsigned long long) j->page, (unsigned long long) j->pages);
8634 server.vm_stats_swapped_objects++;
8635 server.vm_stats_swapouts++;
8636 freeIOJob(j);
8637 /* Put a few more swap requests in queue if we are still
8638 * out of memory */
8639 if (trytoswap && vmCanSwapOut() &&
8640 zmalloc_used_memory() > server.vm_max_memory)
8641 {
8642 int more = 1;
8643 while(more) {
8644 lockThreadedIO();
8645 more = listLength(server.io_newjobs) <
8646 (unsigned) server.vm_max_threads;
8647 unlockThreadedIO();
8648 /* Don't waste CPU time if swappable objects are rare. */
8649 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8650 trytoswap = 0;
8651 break;
8652 }
8653 }
8654 }
8655 }
8656 processed++;
8657 if (processed == toprocess) return;
8658 }
8659 if (retval < 0 && errno != EAGAIN) {
8660 redisLog(REDIS_WARNING,
8661 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8662 strerror(errno));
8663 }
8664 }
8665
8666 static void lockThreadedIO(void) {
8667 pthread_mutex_lock(&server.io_mutex);
8668 }
8669
8670 static void unlockThreadedIO(void) {
8671 pthread_mutex_unlock(&server.io_mutex);
8672 }
8673
8674 /* Remove the specified object from the threaded I/O queue if still not
8675 * processed, otherwise make sure to flag it as canceled. */
8676 static void vmCancelThreadedIOJob(robj *o) {
8677 list *lists[3] = {
8678 server.io_newjobs, /* 0 */
8679 server.io_processing, /* 1 */
8680 server.io_processed /* 2 */
8681 };
8682 int i;
8683
8684 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8685 again:
8686 lockThreadedIO();
8687 /* Search for a matching key in one of the queues */
8688 for (i = 0; i < 3; i++) {
8689 listNode *ln;
8690 listIter li;
8691
8692 listRewind(lists[i],&li);
8693 while ((ln = listNext(&li)) != NULL) {
8694 iojob *job = ln->value;
8695
8696 if (job->canceled) continue; /* Skip this, already canceled. */
8697 if (compareStringObjects(job->key,o) == 0) {
8698 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8699 (void*)job, (char*)o->ptr, job->type, i);
8700 /* Mark the pages as free since the swap didn't happened
8701 * or happened but is now discarded. */
8702 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8703 vmMarkPagesFree(job->page,job->pages);
8704 /* Cancel the job. It depends on the list the job is
8705 * living in. */
8706 switch(i) {
8707 case 0: /* io_newjobs */
8708 /* If the job was yet not processed the best thing to do
8709 * is to remove it from the queue at all */
8710 freeIOJob(job);
8711 listDelNode(lists[i],ln);
8712 break;
8713 case 1: /* io_processing */
8714 /* Oh Shi- the thread is messing with the Job:
8715 *
8716 * Probably it's accessing the object if this is a
8717 * PREPARE_SWAP or DO_SWAP job.
8718 * If it's a LOAD job it may be reading from disk and
8719 * if we don't wait for the job to terminate before to
8720 * cancel it, maybe in a few microseconds data can be
8721 * corrupted in this pages. So the short story is:
8722 *
8723 * Better to wait for the job to move into the
8724 * next queue (processed)... */
8725
8726 /* We try again and again until the job is completed. */
8727 unlockThreadedIO();
8728 /* But let's wait some time for the I/O thread
8729 * to finish with this job. After all this condition
8730 * should be very rare. */
8731 usleep(1);
8732 goto again;
8733 case 2: /* io_processed */
8734 /* The job was already processed, that's easy...
8735 * just mark it as canceled so that we'll ignore it
8736 * when processing completed jobs. */
8737 job->canceled = 1;
8738 break;
8739 }
8740 /* Finally we have to adjust the storage type of the object
8741 * in order to "UNDO" the operaiton. */
8742 if (o->storage == REDIS_VM_LOADING)
8743 o->storage = REDIS_VM_SWAPPED;
8744 else if (o->storage == REDIS_VM_SWAPPING)
8745 o->storage = REDIS_VM_MEMORY;
8746 unlockThreadedIO();
8747 return;
8748 }
8749 }
8750 }
8751 unlockThreadedIO();
8752 assert(1 != 1); /* We should never reach this */
8753 }
8754
8755 static void *IOThreadEntryPoint(void *arg) {
8756 iojob *j;
8757 listNode *ln;
8758 REDIS_NOTUSED(arg);
8759
8760 pthread_detach(pthread_self());
8761 while(1) {
8762 /* Get a new job to process */
8763 lockThreadedIO();
8764 if (listLength(server.io_newjobs) == 0) {
8765 /* No new jobs in queue, exit. */
8766 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8767 (long) pthread_self());
8768 server.io_active_threads--;
8769 unlockThreadedIO();
8770 return NULL;
8771 }
8772 ln = listFirst(server.io_newjobs);
8773 j = ln->value;
8774 listDelNode(server.io_newjobs,ln);
8775 /* Add the job in the processing queue */
8776 j->thread = pthread_self();
8777 listAddNodeTail(server.io_processing,j);
8778 ln = listLast(server.io_processing); /* We use ln later to remove it */
8779 unlockThreadedIO();
8780 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8781 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8782
8783 /* Process the Job */
8784 if (j->type == REDIS_IOJOB_LOAD) {
8785 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8786 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8787 FILE *fp = fopen("/dev/null","w+");
8788 j->pages = rdbSavedObjectPages(j->val,fp);
8789 fclose(fp);
8790 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8791 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8792 j->canceled = 1;
8793 }
8794
8795 /* Done: insert the job into the processed queue */
8796 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8797 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8798 lockThreadedIO();
8799 listDelNode(server.io_processing,ln);
8800 listAddNodeTail(server.io_processed,j);
8801 unlockThreadedIO();
8802
8803 /* Signal the main thread there is new stuff to process */
8804 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8805 }
8806 return NULL; /* never reached */
8807 }
8808
8809 static void spawnIOThread(void) {
8810 pthread_t thread;
8811 sigset_t mask, omask;
8812 int err;
8813
8814 sigemptyset(&mask);
8815 sigaddset(&mask,SIGCHLD);
8816 sigaddset(&mask,SIGHUP);
8817 sigaddset(&mask,SIGPIPE);
8818 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8819 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8820 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8821 strerror(err));
8822 usleep(1000000);
8823 }
8824 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8825 server.io_active_threads++;
8826 }
8827
8828 /* We need to wait for the last thread to exit before we are able to
8829 * fork() in order to BGSAVE or BGREWRITEAOF. */
8830 static void waitEmptyIOJobsQueue(void) {
8831 while(1) {
8832 int io_processed_len;
8833
8834 lockThreadedIO();
8835 if (listLength(server.io_newjobs) == 0 &&
8836 listLength(server.io_processing) == 0 &&
8837 server.io_active_threads == 0)
8838 {
8839 unlockThreadedIO();
8840 return;
8841 }
8842 /* While waiting for empty jobs queue condition we post-process some
8843 * finshed job, as I/O threads may be hanging trying to write against
8844 * the io_ready_pipe_write FD but there are so much pending jobs that
8845 * it's blocking. */
8846 io_processed_len = listLength(server.io_processed);
8847 unlockThreadedIO();
8848 if (io_processed_len) {
8849 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8850 usleep(1000); /* 1 millisecond */
8851 } else {
8852 usleep(10000); /* 10 milliseconds */
8853 }
8854 }
8855 }
8856
8857 static void vmReopenSwapFile(void) {
8858 /* Note: we don't close the old one as we are in the child process
8859 * and don't want to mess at all with the original file object. */
8860 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8861 if (server.vm_fp == NULL) {
8862 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8863 server.vm_swap_file);
8864 _exit(1);
8865 }
8866 server.vm_fd = fileno(server.vm_fp);
8867 }
8868
8869 /* This function must be called while with threaded IO locked */
8870 static void queueIOJob(iojob *j) {
8871 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8872 (void*)j, j->type, (char*)j->key->ptr);
8873 listAddNodeTail(server.io_newjobs,j);
8874 if (server.io_active_threads < server.vm_max_threads)
8875 spawnIOThread();
8876 }
8877
8878 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8879 iojob *j;
8880
8881 assert(key->storage == REDIS_VM_MEMORY);
8882 assert(key->refcount == 1);
8883
8884 j = zmalloc(sizeof(*j));
8885 j->type = REDIS_IOJOB_PREPARE_SWAP;
8886 j->db = db;
8887 j->key = dupStringObject(key);
8888 j->val = val;
8889 incrRefCount(val);
8890 j->canceled = 0;
8891 j->thread = (pthread_t) -1;
8892 key->storage = REDIS_VM_SWAPPING;
8893
8894 lockThreadedIO();
8895 queueIOJob(j);
8896 unlockThreadedIO();
8897 return REDIS_OK;
8898 }
8899
8900 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8901
8902 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8903 * If there is not already a job loading the key, it is craeted.
8904 * The key is added to the io_keys list in the client structure, and also
8905 * in the hash table mapping swapped keys to waiting clients, that is,
8906 * server.io_waited_keys. */
8907 static int waitForSwappedKey(redisClient *c, robj *key) {
8908 struct dictEntry *de;
8909 robj *o;
8910 list *l;
8911
8912 /* If the key does not exist or is already in RAM we don't need to
8913 * block the client at all. */
8914 de = dictFind(c->db->dict,key);
8915 if (de == NULL) return 0;
8916 o = dictGetEntryKey(de);
8917 if (o->storage == REDIS_VM_MEMORY) {
8918 return 0;
8919 } else if (o->storage == REDIS_VM_SWAPPING) {
8920 /* We were swapping the key, undo it! */
8921 vmCancelThreadedIOJob(o);
8922 return 0;
8923 }
8924
8925 /* OK: the key is either swapped, or being loaded just now. */
8926
8927 /* Add the key to the list of keys this client is waiting for.
8928 * This maps clients to keys they are waiting for. */
8929 listAddNodeTail(c->io_keys,key);
8930 incrRefCount(key);
8931
8932 /* Add the client to the swapped keys => clients waiting map. */
8933 de = dictFind(c->db->io_keys,key);
8934 if (de == NULL) {
8935 int retval;
8936
8937 /* For every key we take a list of clients blocked for it */
8938 l = listCreate();
8939 retval = dictAdd(c->db->io_keys,key,l);
8940 incrRefCount(key);
8941 assert(retval == DICT_OK);
8942 } else {
8943 l = dictGetEntryVal(de);
8944 }
8945 listAddNodeTail(l,c);
8946
8947 /* Are we already loading the key from disk? If not create a job */
8948 if (o->storage == REDIS_VM_SWAPPED) {
8949 iojob *j;
8950
8951 o->storage = REDIS_VM_LOADING;
8952 j = zmalloc(sizeof(*j));
8953 j->type = REDIS_IOJOB_LOAD;
8954 j->db = c->db;
8955 j->key = dupStringObject(key);
8956 j->key->vtype = o->vtype;
8957 j->page = o->vm.page;
8958 j->val = NULL;
8959 j->canceled = 0;
8960 j->thread = (pthread_t) -1;
8961 lockThreadedIO();
8962 queueIOJob(j);
8963 unlockThreadedIO();
8964 }
8965 return 1;
8966 }
8967
8968 /* Preload keys needed for the ZUNION and ZINTER commands. */
8969 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
8970 int i, num;
8971 num = atoi(c->argv[2]->ptr);
8972 for (i = 0; i < num; i++) {
8973 waitForSwappedKey(c,c->argv[3+i]);
8974 }
8975 }
8976
8977 /* Is this client attempting to run a command against swapped keys?
8978 * If so, block it ASAP, load the keys in background, then resume it.
8979 *
8980 * The important idea about this function is that it can fail! If keys will
8981 * still be swapped when the client is resumed, this key lookups will
8982 * just block loading keys from disk. In practical terms this should only
8983 * happen with SORT BY command or if there is a bug in this function.
8984 *
8985 * Return 1 if the client is marked as blocked, 0 if the client can
8986 * continue as the keys it is going to access appear to be in memory. */
8987 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8988 int j, last;
8989
8990 if (cmd->vm_preload_proc != NULL) {
8991 cmd->vm_preload_proc(c);
8992 } else {
8993 if (cmd->vm_firstkey == 0) return 0;
8994 last = cmd->vm_lastkey;
8995 if (last < 0) last = c->argc+last;
8996 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8997 waitForSwappedKey(c,c->argv[j]);
8998 }
8999
9000 /* If the client was blocked for at least one key, mark it as blocked. */
9001 if (listLength(c->io_keys)) {
9002 c->flags |= REDIS_IO_WAIT;
9003 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9004 server.vm_blocked_clients++;
9005 return 1;
9006 } else {
9007 return 0;
9008 }
9009 }
9010
9011 /* Remove the 'key' from the list of blocked keys for a given client.
9012 *
9013 * The function returns 1 when there are no longer blocking keys after
9014 * the current one was removed (and the client can be unblocked). */
9015 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9016 list *l;
9017 listNode *ln;
9018 listIter li;
9019 struct dictEntry *de;
9020
9021 /* Remove the key from the list of keys this client is waiting for. */
9022 listRewind(c->io_keys,&li);
9023 while ((ln = listNext(&li)) != NULL) {
9024 if (compareStringObjects(ln->value,key) == 0) {
9025 listDelNode(c->io_keys,ln);
9026 break;
9027 }
9028 }
9029 assert(ln != NULL);
9030
9031 /* Remove the client form the key => waiting clients map. */
9032 de = dictFind(c->db->io_keys,key);
9033 assert(de != NULL);
9034 l = dictGetEntryVal(de);
9035 ln = listSearchKey(l,c);
9036 assert(ln != NULL);
9037 listDelNode(l,ln);
9038 if (listLength(l) == 0)
9039 dictDelete(c->db->io_keys,key);
9040
9041 return listLength(c->io_keys) == 0;
9042 }
9043
9044 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9045 struct dictEntry *de;
9046 list *l;
9047 listNode *ln;
9048 int len;
9049
9050 de = dictFind(db->io_keys,key);
9051 if (!de) return;
9052
9053 l = dictGetEntryVal(de);
9054 len = listLength(l);
9055 /* Note: we can't use something like while(listLength(l)) as the list
9056 * can be freed by the calling function when we remove the last element. */
9057 while (len--) {
9058 ln = listFirst(l);
9059 redisClient *c = ln->value;
9060
9061 if (dontWaitForSwappedKey(c,key)) {
9062 /* Put the client in the list of clients ready to go as we
9063 * loaded all the keys about it. */
9064 listAddNodeTail(server.io_ready_clients,c);
9065 }
9066 }
9067 }
9068
9069 /* =========================== Remote Configuration ========================= */
9070
9071 static void configSetCommand(redisClient *c) {
9072 robj *o = getDecodedObject(c->argv[3]);
9073 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9074 zfree(server.dbfilename);
9075 server.dbfilename = zstrdup(o->ptr);
9076 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9077 zfree(server.requirepass);
9078 server.requirepass = zstrdup(o->ptr);
9079 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9080 zfree(server.masterauth);
9081 server.masterauth = zstrdup(o->ptr);
9082 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9083 server.maxmemory = strtoll(o->ptr, NULL, 10);
9084 } else {
9085 addReplySds(c,sdscatprintf(sdsempty(),
9086 "-ERR not supported CONFIG parameter %s\r\n",
9087 (char*)c->argv[2]->ptr));
9088 decrRefCount(o);
9089 return;
9090 }
9091 decrRefCount(o);
9092 addReply(c,shared.ok);
9093 }
9094
9095 static void configGetCommand(redisClient *c) {
9096 robj *o = getDecodedObject(c->argv[2]);
9097 robj *lenobj = createObject(REDIS_STRING,NULL);
9098 char *pattern = o->ptr;
9099 int matches = 0;
9100
9101 addReply(c,lenobj);
9102 decrRefCount(lenobj);
9103
9104 if (stringmatch(pattern,"dbfilename",0)) {
9105 addReplyBulkCString(c,"dbfilename");
9106 addReplyBulkCString(c,server.dbfilename);
9107 matches++;
9108 }
9109 if (stringmatch(pattern,"requirepass",0)) {
9110 addReplyBulkCString(c,"requirepass");
9111 addReplyBulkCString(c,server.requirepass);
9112 matches++;
9113 }
9114 if (stringmatch(pattern,"masterauth",0)) {
9115 addReplyBulkCString(c,"masterauth");
9116 addReplyBulkCString(c,server.masterauth);
9117 matches++;
9118 }
9119 if (stringmatch(pattern,"maxmemory",0)) {
9120 char buf[128];
9121
9122 snprintf(buf,128,"%llu\n",server.maxmemory);
9123 addReplyBulkCString(c,"maxmemory");
9124 addReplyBulkCString(c,buf);
9125 matches++;
9126 }
9127 decrRefCount(o);
9128 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9129 }
9130
9131 static void configCommand(redisClient *c) {
9132 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9133 if (c->argc != 4) goto badarity;
9134 configSetCommand(c);
9135 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9136 if (c->argc != 3) goto badarity;
9137 configGetCommand(c);
9138 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9139 if (c->argc != 2) goto badarity;
9140 server.stat_numcommands = 0;
9141 server.stat_numconnections = 0;
9142 server.stat_expiredkeys = 0;
9143 server.stat_starttime = time(NULL);
9144 addReply(c,shared.ok);
9145 } else {
9146 addReplySds(c,sdscatprintf(sdsempty(),
9147 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9148 }
9149 return;
9150
9151 badarity:
9152 addReplySds(c,sdscatprintf(sdsempty(),
9153 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9154 (char*) c->argv[1]->ptr));
9155 }
9156
9157 /* ================================= Debugging ============================== */
9158
9159 static void debugCommand(redisClient *c) {
9160 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9161 *((char*)-1) = 'x';
9162 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9163 if (rdbSave(server.dbfilename) != REDIS_OK) {
9164 addReply(c,shared.err);
9165 return;
9166 }
9167 emptyDb();
9168 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9169 addReply(c,shared.err);
9170 return;
9171 }
9172 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9173 addReply(c,shared.ok);
9174 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9175 emptyDb();
9176 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9177 addReply(c,shared.err);
9178 return;
9179 }
9180 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9181 addReply(c,shared.ok);
9182 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9183 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9184 robj *key, *val;
9185
9186 if (!de) {
9187 addReply(c,shared.nokeyerr);
9188 return;
9189 }
9190 key = dictGetEntryKey(de);
9191 val = dictGetEntryVal(de);
9192 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9193 key->storage == REDIS_VM_SWAPPING)) {
9194 char *strenc;
9195 char buf[128];
9196
9197 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9198 strenc = strencoding[val->encoding];
9199 } else {
9200 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9201 strenc = buf;
9202 }
9203 addReplySds(c,sdscatprintf(sdsempty(),
9204 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9205 "encoding:%s serializedlength:%lld\r\n",
9206 (void*)key, key->refcount, (void*)val, val->refcount,
9207 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9208 } else {
9209 addReplySds(c,sdscatprintf(sdsempty(),
9210 "+Key at:%p refcount:%d, value swapped at: page %llu "
9211 "using %llu pages\r\n",
9212 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9213 (unsigned long long) key->vm.usedpages));
9214 }
9215 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9216 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9217 robj *key, *val;
9218
9219 if (!server.vm_enabled) {
9220 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9221 return;
9222 }
9223 if (!de) {
9224 addReply(c,shared.nokeyerr);
9225 return;
9226 }
9227 key = dictGetEntryKey(de);
9228 val = dictGetEntryVal(de);
9229 /* If the key is shared we want to create a copy */
9230 if (key->refcount > 1) {
9231 robj *newkey = dupStringObject(key);
9232 decrRefCount(key);
9233 key = dictGetEntryKey(de) = newkey;
9234 }
9235 /* Swap it */
9236 if (key->storage != REDIS_VM_MEMORY) {
9237 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9238 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9239 dictGetEntryVal(de) = NULL;
9240 addReply(c,shared.ok);
9241 } else {
9242 addReply(c,shared.err);
9243 }
9244 } else {
9245 addReplySds(c,sdsnew(
9246 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9247 }
9248 }
9249
9250 static void _redisAssert(char *estr, char *file, int line) {
9251 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9252 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9253 #ifdef HAVE_BACKTRACE
9254 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9255 *((char*)-1) = 'x';
9256 #endif
9257 }
9258
9259 /* =================================== Main! ================================ */
9260
9261 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted to
 * an integer with atoi(). Returns -1 if the file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,sizeof(line),fp);
    fclose(fp);
    return (got == NULL) ? -1 : atoi(line);
}
9275
9276 void linuxOvercommitMemoryWarning(void) {
9277 if (linuxOvercommitMemoryValue() == 0) {
9278 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9279 }
9280 }
9281 #endif /* __linux__ */
9282
9283 static void daemonize(void) {
9284 int fd;
9285 FILE *fp;
9286
9287 if (fork() != 0) exit(0); /* parent exits */
9288 setsid(); /* create a new session */
9289
9290 /* Every output goes to /dev/null. If Redis is daemonized but
9291 * the 'logfile' is set to 'stdout' in the configuration file
9292 * it will not log at all. */
9293 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9294 dup2(fd, STDIN_FILENO);
9295 dup2(fd, STDOUT_FILENO);
9296 dup2(fd, STDERR_FILENO);
9297 if (fd > STDERR_FILENO) close(fd);
9298 }
9299 /* Try to write the pid file */
9300 fp = fopen(server.pidfile,"w");
9301 if (fp) {
9302 fprintf(fp,"%d\n",getpid());
9303 fclose(fp);
9304 }
9305 }
9306
9307 static void version() {
9308 printf("Redis server version %s\n", REDIS_VERSION);
9309 exit(0);
9310 }
9311
/* Print the command line usage on standard error and exit with status 1. */
static void usage(void) {
    fprintf(stderr,
        "Usage: ./redis-server [/path/to/redis.conf]\n"
        " ./redis-server - (read config from stdin)\n");
    exit(1);
}
9317
9318 int main(int argc, char **argv) {
9319 time_t start;
9320
9321 initServerConfig();
9322 if (argc == 2) {
9323 if (strcmp(argv[1], "-v") == 0 ||
9324 strcmp(argv[1], "--version") == 0) version();
9325 if (strcmp(argv[1], "--help") == 0) usage();
9326 resetServerSaveParams();
9327 loadServerConfig(argv[1]);
9328 } else if ((argc > 2)) {
9329 usage();
9330 } else {
9331 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9332 }
9333 if (server.daemonize) daemonize();
9334 initServer();
9335 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9336 #ifdef __linux__
9337 linuxOvercommitMemoryWarning();
9338 #endif
9339 start = time(NULL);
9340 if (server.appendonly) {
9341 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9342 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9343 } else {
9344 if (rdbLoad(server.dbfilename) == REDIS_OK)
9345 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9346 }
9347 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9348 aeSetBeforeSleepProc(server.el,beforeSleep);
9349 aeMain(server.el);
9350 aeDeleteEventLoop(server.el);
9351 return 0;
9352 }
9353
9354 /* ============================= Backtrace support ========================= */
9355
9356 #ifdef HAVE_BACKTRACE
9357 static char *findFuncName(void *pointer, unsigned long *offset);
9358
9359 static void *getMcontextEip(ucontext_t *uc) {
9360 #if defined(__FreeBSD__)
9361 return (void*) uc->uc_mcontext.mc_eip;
9362 #elif defined(__dietlibc__)
9363 return (void*) uc->uc_mcontext.eip;
9364 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9365 #if __x86_64__
9366 return (void*) uc->uc_mcontext->__ss.__rip;
9367 #else
9368 return (void*) uc->uc_mcontext->__ss.__eip;
9369 #endif
9370 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9371 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9372 return (void*) uc->uc_mcontext->__ss.__rip;
9373 #else
9374 return (void*) uc->uc_mcontext->__ss.__eip;
9375 #endif
9376 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9377 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9378 #elif defined(__ia64__) /* Linux IA64 */
9379 return (void*) uc->uc_mcontext.sc_ip;
9380 #else
9381 return NULL;
9382 #endif
9383 }
9384
9385 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9386 void *trace[100];
9387 char **messages = NULL;
9388 int i, trace_size = 0;
9389 unsigned long offset=0;
9390 ucontext_t *uc = (ucontext_t*) secret;
9391 sds infostring;
9392 REDIS_NOTUSED(info);
9393
9394 redisLog(REDIS_WARNING,
9395 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9396 infostring = genRedisInfoString();
9397 redisLog(REDIS_WARNING, "%s",infostring);
9398 /* It's not safe to sdsfree() the returned string under memory
9399 * corruption conditions. Let it leak as we are going to abort */
9400
9401 trace_size = backtrace(trace, 100);
9402 /* overwrite sigaction with caller's address */
9403 if (getMcontextEip(uc) != NULL) {
9404 trace[1] = getMcontextEip(uc);
9405 }
9406 messages = backtrace_symbols(trace, trace_size);
9407
9408 for (i=1; i<trace_size; ++i) {
9409 char *fn = findFuncName(trace[i], &offset), *p;
9410
9411 p = strchr(messages[i],'+');
9412 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9413 redisLog(REDIS_WARNING,"%s", messages[i]);
9414 } else {
9415 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9416 }
9417 }
9418 /* free(messages); Don't call free() with possibly corrupted memory. */
9419 _exit(0);
9420 }
9421
9422 static void setupSigSegvAction(void) {
9423 struct sigaction act;
9424
9425 sigemptyset (&act.sa_mask);
9426 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9427 * is used. Otherwise, sa_handler is used */
9428 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9429 act.sa_sigaction = segvHandler;
9430 sigaction (SIGSEGV, &act, NULL);
9431 sigaction (SIGBUS, &act, NULL);
9432 sigaction (SIGFPE, &act, NULL);
9433 sigaction (SIGILL, &act, NULL);
9434 sigaction (SIGBUS, &act, NULL);
9435 return;
9436 }
9437
9438 #include "staticsymbols.h"
9439 /* This function try to convert a pointer into a function name. It's used in
9440 * oreder to provide a backtrace under segmentation fault that's able to
9441 * display functions declared as static (otherwise the backtrace is useless). */
9442 static char *findFuncName(void *pointer, unsigned long *offset){
9443 int i, ret = -1;
9444 unsigned long off, minoff = 0;
9445
9446 /* Try to match against the Symbol with the smallest offset */
9447 for (i=0; symsTable[i].pointer; i++) {
9448 unsigned long lp = (unsigned long) pointer;
9449
9450 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9451 off=lp-symsTable[i].pointer;
9452 if (ret < 0 || off < minoff) {
9453 minoff=off;
9454 ret=i;
9455 }
9456 }
9457 }
9458 if (ret == -1) return NULL;
9459 *offset = minoff;
9460 return symsTable[ret].name;
9461 }
9462 #else /* HAVE_BACKTRACE */
/* Version used when backtrace support is not available: it does nothing. */
static void setupSigSegvAction(void) {
    /* No handler to install without HAVE_BACKTRACE. */
}
9465 #endif /* HAVE_BACKTRACE */
9466
9467
9468
9469 /* The End */
9470
9471
9472