]> git.saurik.com Git - redis.git/blob - redis.c
Removed a useless if spotted by Pieter Noordhuis
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.7" /* Version string of this Redis source tree */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes: generic success / failure return values */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout (presumably in seconds, i.e. 5 minutes) */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16 /* default number of databases */
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more than this to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MB) */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill, expressed as a percentage (10%) */
105
106 /* Command flags: bit values that can be OR-ed together in redisCommand.flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types, stored in the 'type' field of a redis object */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
129 /* Human readable encoding names; the order matches the REDIS_ENCODING_* values above */
130 static char* strencoding[] = {
131 "raw", "int", "zipmap", "hashtable"
132 };
133
134 /* Object types only used for dumping to disk (they are not in-memory object types) */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
138
139 /* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specifies the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
149 *
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX /* presumably the "invalid length" sentinel -- confirm against the length reader */
157
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining bits specify a special encoding for the object
160 * according to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
165
166 /* Virtual memory object->storage field. */
167 #define REDIS_VM_MEMORY 0 /* The object is in memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
184 /* Client flags: bit values OR-ed together in redisClient.flags */
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
197 /* Slave replication state - from the point of view of master.
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave side states (0..2). */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits for bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits for bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
206 /* List related stuff: which end of a list an operation refers to */
207 #define REDIS_HEAD 0
208 #define REDIS_TAIL 1
209
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024 /* presumably the max length of a sort key buffer -- confirm in the SORT code */
215
216 /* Log levels, numbered from the most verbose (0) to the most important (3) */
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
221
222 /* Anti-warning macro: evaluates V as a void expression so that an otherwise
 * unused variable or parameter counts as used. */
223 #define REDIS_NOTUSED(V) ((void) V)
224
225 #define ZSKIPLIST_MAXLEVEL 32 /* Max skiplist level. Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227
228 /* Append only defines: fsync policies (presumably the values of server.appendfsync) */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
232
233 /* Hashes related defaults (presumably for server.hash_max_zipmap_entries / _value) */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
237 /* We can print the stacktrace, so our assert is defined this way: when the
 * expression is false, _redisAssert() is called with the stringified
 * expression, the file name and the line number, then the process
 * terminates with _exit(1). When the expression is true nothing happens. */
238 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
239 static void _redisAssert(char *estr, char *file, int line);
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
245 /* The VM object structure: on-disk location and access time of an object */
246 struct redisObjectVM {
247 off_t page; /* the page at which the object is stored on disk */
248 off_t usedpages; /* number of pages used on disk */
249 time_t atime; /* Last access time */
250 } vm; /* NOTE(review): this trailing 'vm' declarator also defines a file-scope variable of this type; it looks unintended -- confirm it is unused before removing it */
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr; /* pointer to the actual value */
255 unsigned char type; /* one of the object type defines (REDIS_STRING, ...) */
256 unsigned char encoding; /* one of the REDIS_ENCODING_* defines */
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount; /* reference count */
262 /* VM fields, these are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. This is why 'vm'
 * has to stay the last member of the structure. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Macro used to initialize a Redis object allocated on the stack as a raw
 * string object: refcount 1, type REDIS_STRING, encoding REDIS_ENCODING_RAW,
 * and 'ptr' set to _ptr. The 'storage' field is only written (set to
 * REDIS_VM_MEMORY) when server.vm_enabled is true.
 *
 * _var is the object itself (an lvalue, not a pointer), _ptr the string.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller's own
 * ';' completes the do/while(0) statement, so the macro can also be used as
 * the body of an unbraced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
281 typedef struct redisDb { /* One Redis database */
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id; /* presumably the database number -- confirm where it is assigned */
287 } redisDb;
288
289 /* Client MULTI/EXEC state: a single queued command */
290 typedef struct multiCmd {
291 robj **argv; /* argument vector of the queued command */
292 int argc; /* number of elements in argv */
293 struct redisCommand *cmd; /* the command this entry refers to */
294 } multiCmd;
295
296 typedef struct multiState { /* Per-client set of queued MULTI commands */
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to keep per-client state.
302 * Clients are kept in a linked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset. NOTE(review): 'long' here vs 'off_t' for repldbsize below -- confirm this cannot truncate */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 } redisClient;
331
332 struct saveparam { /* One save point; presumably "save after 'changes' changes in 'seconds' seconds" -- confirm against the code that reads saveparams */
333 time_t seconds;
334 int changes;
335 };
336
337 /* Global server state structure */
338 struct redisServer {
339 int port;
340 int fd;
341 redisDb *db;
342 dict *sharingpool; /* Pool used for object sharing */
343 unsigned int sharingpoolsize;
344 long long dirty; /* changes to DB from the last save */
345 list *clients;
346 list *slaves, *monitors;
347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of the last successful save */
352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 long long stat_expiredkeys; /* number of expired keys */
357 /* Configuration */
358 int verbosity;
359 int glueoutputbuf;
360 int maxidletime;
361 int dbnum;
362 int daemonize;
363 int appendonly;
364 int appendfsync;
365 time_t lastfsync;
366 int appendfd;
367 int appendseldb;
368 char *pidfile;
369 pid_t bgsavechildpid;
370 pid_t bgrewritechildpid;
371 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
372 struct saveparam *saveparams;
373 int saveparamslen;
374 char *logfile;
375 char *bindaddr;
376 char *dbfilename;
377 char *appendfilename;
378 char *requirepass;
379 int shareobjects;
380 int rdbcompression;
381 /* Replication related */
382 int isslave;
383 char *masterauth;
384 char *masterhost;
385 int masterport;
386 redisClient *master; /* client that is master for this slave */
387 int replstate;
388 unsigned int maxclients;
389 unsigned long long maxmemory;
390 unsigned int blpop_blocked_clients;
391 unsigned int vm_blocked_clients;
392 /* Sort parameters - qsort_r() is only available under BSD so we
393 * have to keep this state global, in order to pass it to sortCompare() */
394 int sort_desc;
395 int sort_alpha;
396 int sort_bypattern;
397 /* Virtual memory configuration */
398 int vm_enabled;
399 char *vm_swap_file;
400 off_t vm_page_size;
401 off_t vm_pages;
402 unsigned long long vm_max_memory;
403 /* Hashes config */
404 size_t hash_max_zipmap_entries;
405 size_t hash_max_zipmap_value;
406 /* Virtual memory state */
407 FILE *vm_fp;
408 int vm_fd;
409 off_t vm_next_page; /* Next probably empty page */
410 off_t vm_near_pages; /* Number of pages allocated sequentially */
411 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
412 time_t unixtime; /* Unix time sampled every second. */
413 /* Virtual memory I/O threads stuff */
414 /* An I/O thread processes an element taken from the io_newjobs list and
415 * puts the result of the operation in the io_processed list. While the
416 * job is being processed, it's put on the io_processing list. */
417 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
418 list *io_processing; /* List of VM I/O jobs being processed */
419 list *io_processed; /* List of VM I/O jobs already processed */
420 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
421 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (presumably io_newjobs/io_processing/io_processed -- confirm) */
422 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
423 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
424 pthread_attr_t io_threads_attr; /* attributes for threads creation */
425 int io_active_threads; /* Number of running I/O threads */
426 int vm_max_threads; /* Max number of I/O threads running at the same time */
427 /* Our main thread is blocked on the event loop, looking for sockets ready
428 * to be read or written, so when a threaded I/O operation is ready to be
429 * processed by the main thread, the I/O thread will use a unix pipe to
430 * awake the main thread. The following are the two pipe FDs. */
431 int io_ready_pipe_read;
432 int io_ready_pipe_write;
433 /* Virtual memory stats */
434 unsigned long long vm_stats_used_pages;
435 unsigned long long vm_stats_swapped_objects;
436 unsigned long long vm_stats_swapouts;
437 unsigned long long vm_stats_swapins;
438 FILE *devnull;
439 };
440
441 typedef void redisCommandProc(redisClient *c); /* Signature shared by all command handlers */
442 struct redisCommand { /* One entry of the command table (cmdTable) */
443 char *name; /* command name as written in the command table, e.g. "get" */
444 redisCommandProc *proc; /* handler implementing the command */
445 int arity; /* argument count; the table uses negative values for some commands (presumably "at least N" -- confirm in processCommand) */
446 int flags; /* REDIS_CMD_* flags OR-ed together */
447 /* Use a function to determine which keys need to be loaded
448 * in the background prior to executing this command. Takes precedence
449 * over vm_firstkey and others, ignored when NULL */
450 redisCommandProc *vm_preload_proc;
451 /* What keys should be loaded in background when calling this command? */
452 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
453 int vm_lastkey; /* The last argument that's a key */
454 int vm_keystep; /* The step between first and last key */
455 };
456
457 struct redisFunctionSym { /* Pairs a function name with an address; presumably used to symbolize stack traces -- confirm at the use site */
458 char *name;
459 unsigned long pointer;
460 };
461
462 typedef struct _redisSortObject { /* An element being sorted together with its sort key */
463 robj *obj; /* the element itself */
464 union { /* sort key: either a numeric score or an object to compare (which one is active is decided by the sort code, not visible here) */
465 double score;
466 robj *cmpobj;
467 } u;
468 } redisSortObject;
469
470 typedef struct _redisSortOperation { /* An operation attached to a sort */
471 int type; /* presumably one of the REDIS_SORT_* defines (e.g. REDIS_SORT_GET) -- confirm */
472 robj *pattern;
473 } redisSortOperation;
474
475 /* ZSETs use a specialized version of Skiplists */
476
477 typedef struct zskiplistNode { /* A skiplist node: one (score, object) pair */
478 struct zskiplistNode **forward; /* array of forward pointers (presumably one per level of this node) */
479 struct zskiplistNode *backward; /* previous node */
480 unsigned int *span; /* array of spans (presumably the distance covered by each forward pointer -- confirm in zslInsert) */
481 double score;
482 robj *obj;
483 } zskiplistNode;
484
485 typedef struct zskiplist { /* The skiplist itself */
486 struct zskiplistNode *header, *tail; /* first and last node */
487 unsigned long length; /* presumably the number of elements -- confirm */
488 int level; /* presumably the highest level currently in use (bounded by ZSKIPLIST_MAXLEVEL) -- confirm */
489 } zskiplist;
490
491 typedef struct zset { /* Sorted set: a hash table paired with a skiplist */
492 dict *dict;
493 zskiplist *zsl;
494 } zset;
495
496 /* Our shared "common" objects */
497 /* Pointers to objects kept in the single global 'shared' instance (presumably pre-built protocol replies, going by the member names) */
498 struct sharedObjectsStruct {
499 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
500 *colon, *nullbulk, *nullmultibulk, *queued,
501 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
502 *outofrangeerr, *plus,
503 *select0, *select1, *select2, *select3, *select4,
504 *select5, *select6, *select7, *select8, *select9;
505 } shared;
506
507 /* Global vars that are actually used as constants. The following double
508 * values are used for double on-disk serialization, and are initialized
509 * at runtime to avoid strange compiler optimizations. */
510
511 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
512
513 /* VM threaded I/O request message */
514 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
515 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
516 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
517 typedef struct iojob {
518 int type; /* Request type, REDIS_IOJOB_* */
519 redisDb *db;/* Redis database */
520 robj *key; /* This I/O request is about swapping this key */
521 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
522 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
523 off_t page; /* Swap page where to read/write the object */
524 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
525 int canceled; /* True if this command was canceled by blocking side of VM */
526 pthread_t thread; /* ID of the thread processing this entry */
527 } iojob;
528
529 /*================================ Prototypes =============================== */
530 /* Forward declarations of file-local (static) helpers; their definitions are presumably further down in this file. */
531 static void freeStringObject(robj *o);
532 static void freeListObject(robj *o);
533 static void freeSetObject(robj *o);
534 static void decrRefCount(void *o);
535 static robj *createObject(int type, void *ptr);
536 static void freeClient(redisClient *c);
537 static int rdbLoad(char *filename);
538 static void addReply(redisClient *c, robj *obj);
539 static void addReplySds(redisClient *c, sds s);
540 static void incrRefCount(robj *o);
541 static int rdbSaveBackground(char *filename);
542 static robj *createStringObject(char *ptr, size_t len);
543 static robj *dupStringObject(robj *o);
544 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
545 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
546 static int syncWithMaster(void);
547 static robj *tryObjectSharing(robj *o);
548 static int tryObjectEncoding(robj *o);
549 static robj *getDecodedObject(robj *o);
550 static int removeExpire(redisDb *db, robj *key);
551 static int expireIfNeeded(redisDb *db, robj *key);
552 static int deleteIfVolatile(redisDb *db, robj *key);
553 static int deleteIfSwapped(redisDb *db, robj *key);
554 static int deleteKey(redisDb *db, robj *key);
555 static time_t getExpire(redisDb *db, robj *key);
556 static int setExpire(redisDb *db, robj *key, time_t when);
557 static void updateSlavesWaitingBgsave(int bgsaveerr);
558 static void freeMemoryIfNeeded(void);
559 static int processCommand(redisClient *c);
560 static void setupSigSegvAction(void);
561 static void rdbRemoveTempFile(pid_t childpid);
562 static void aofRemoveTempFile(pid_t childpid);
563 static size_t stringObjectLen(robj *o);
564 static void processInputBuffer(redisClient *c);
565 static zskiplist *zslCreate(void);
566 static void zslFree(zskiplist *zsl);
567 static void zslInsert(zskiplist *zsl, double score, robj *obj);
568 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
569 static void initClientMultiState(redisClient *c);
570 static void freeClientMultiState(redisClient *c);
571 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
572 static void unblockClientWaitingData(redisClient *c);
573 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
574 static void vmInit(void);
575 static void vmMarkPagesFree(off_t page, off_t count);
576 static robj *vmLoadObject(robj *key);
577 static robj *vmPreviewObject(robj *key);
578 static int vmSwapOneObjectBlocking(void);
579 static int vmSwapOneObjectThreaded(void);
580 static int vmCanSwapOut(void);
581 static int tryFreeOneObjectFromFreelist(void);
582 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void vmCancelThreadedIOJob(robj *o);
585 static void lockThreadedIO(void);
586 static void unlockThreadedIO(void);
587 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
588 static void freeIOJob(iojob *j);
589 static void queueIOJob(iojob *j);
590 static int vmWriteObjectOnSwap(robj *o, off_t page);
591 static robj *vmReadObjectFromSwap(off_t page, int type);
592 static void waitEmptyIOJobsQueue(void);
593 static void vmReopenSwapFile(void);
594 static int vmFreePage(off_t page);
595 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
596 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
597 static int dontWaitForSwappedKey(redisClient *c, robj *key);
598 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
599 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
600 static struct redisCommand *lookupCommand(char *name);
601 static void call(redisClient *c, struct redisCommand *cmd);
602 static void resetClient(redisClient *c);
603 static void convertToRealHash(robj *o);
604 /* Command handlers. They all share the redisCommandProc signature, void f(redisClient *c), and are bound to command names in cmdTable. */
605 static void authCommand(redisClient *c);
606 static void pingCommand(redisClient *c);
607 static void echoCommand(redisClient *c);
608 static void setCommand(redisClient *c);
609 static void setnxCommand(redisClient *c);
610 static void getCommand(redisClient *c);
611 static void delCommand(redisClient *c);
612 static void existsCommand(redisClient *c);
613 static void incrCommand(redisClient *c);
614 static void decrCommand(redisClient *c);
615 static void incrbyCommand(redisClient *c);
616 static void decrbyCommand(redisClient *c);
617 static void selectCommand(redisClient *c);
618 static void randomkeyCommand(redisClient *c);
619 static void keysCommand(redisClient *c);
620 static void dbsizeCommand(redisClient *c);
621 static void lastsaveCommand(redisClient *c);
622 static void saveCommand(redisClient *c);
623 static void bgsaveCommand(redisClient *c);
624 static void bgrewriteaofCommand(redisClient *c);
625 static void shutdownCommand(redisClient *c);
626 static void moveCommand(redisClient *c);
627 static void renameCommand(redisClient *c);
628 static void renamenxCommand(redisClient *c);
629 static void lpushCommand(redisClient *c);
630 static void rpushCommand(redisClient *c);
631 static void lpopCommand(redisClient *c);
632 static void rpopCommand(redisClient *c);
633 static void llenCommand(redisClient *c);
634 static void lindexCommand(redisClient *c);
635 static void lrangeCommand(redisClient *c);
636 static void ltrimCommand(redisClient *c);
637 static void typeCommand(redisClient *c);
638 static void lsetCommand(redisClient *c);
639 static void saddCommand(redisClient *c);
640 static void sremCommand(redisClient *c);
641 static void smoveCommand(redisClient *c);
642 static void sismemberCommand(redisClient *c);
643 static void scardCommand(redisClient *c);
644 static void spopCommand(redisClient *c);
645 static void srandmemberCommand(redisClient *c);
646 static void sinterCommand(redisClient *c);
647 static void sinterstoreCommand(redisClient *c);
648 static void sunionCommand(redisClient *c);
649 static void sunionstoreCommand(redisClient *c);
650 static void sdiffCommand(redisClient *c);
651 static void sdiffstoreCommand(redisClient *c);
652 static void syncCommand(redisClient *c);
653 static void flushdbCommand(redisClient *c);
654 static void flushallCommand(redisClient *c);
655 static void sortCommand(redisClient *c);
656 static void lremCommand(redisClient *c);
657 static void rpoplpushcommand(redisClient *c); /* note: lowercase 'c' in "command", unlike the other handlers; cmdTable uses the same spelling */
658 static void infoCommand(redisClient *c);
659 static void mgetCommand(redisClient *c);
660 static void monitorCommand(redisClient *c);
661 static void expireCommand(redisClient *c);
662 static void expireatCommand(redisClient *c);
663 static void getsetCommand(redisClient *c);
664 static void ttlCommand(redisClient *c);
665 static void slaveofCommand(redisClient *c);
666 static void debugCommand(redisClient *c);
667 static void msetCommand(redisClient *c);
668 static void msetnxCommand(redisClient *c);
669 static void zaddCommand(redisClient *c);
670 static void zincrbyCommand(redisClient *c);
671 static void zrangeCommand(redisClient *c);
672 static void zrangebyscoreCommand(redisClient *c);
673 static void zcountCommand(redisClient *c);
674 static void zrevrangeCommand(redisClient *c);
675 static void zcardCommand(redisClient *c);
676 static void zremCommand(redisClient *c);
677 static void zscoreCommand(redisClient *c);
678 static void zremrangebyscoreCommand(redisClient *c);
679 static void multiCommand(redisClient *c);
680 static void execCommand(redisClient *c);
681 static void discardCommand(redisClient *c);
682 static void blpopCommand(redisClient *c);
683 static void brpopCommand(redisClient *c);
684 static void appendCommand(redisClient *c);
685 static void substrCommand(redisClient *c);
686 static void zrankCommand(redisClient *c);
687 static void zrevrankCommand(redisClient *c);
688 static void hsetCommand(redisClient *c);
689 static void hgetCommand(redisClient *c);
690 static void hdelCommand(redisClient *c);
691 static void hlenCommand(redisClient *c);
692 static void zremrangebyrankCommand(redisClient *c);
693 static void zunionCommand(redisClient *c);
694 static void zinterCommand(redisClient *c);
695 static void hkeysCommand(redisClient *c);
696 static void hvalsCommand(redisClient *c);
697 static void hgetallCommand(redisClient *c);
698 static void hexistsCommand(redisClient *c);
699 static void configCommand(redisClient *c);
700
701 /*================================= Globals ================================= */
702
703 /* Global vars */
704 static struct redisServer server; /* server global state: the single redisServer instance for this process */
705 static struct redisCommand cmdTable[] = {
706 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
707 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
708 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
709 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
710 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
711 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
712 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
713 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
714 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
715 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
716 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
717 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
718 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
719 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
720 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
721 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
723 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
725 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
726 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
727 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
728 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
729 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
730 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
731 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
732 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
733 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
737 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
738 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
739 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
740 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
741 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
742 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
746 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
749 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
750 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
759 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
760 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
761 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
766 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
770 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
771 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
772 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
773 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
777 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
778 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
779 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
780 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
783 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
785 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
786 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
787 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
788 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
791 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
795 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
802 {NULL,NULL,0,0,NULL,0,0,0}
803 };
804
805 static void usage();
806
807 /*============================ Utility functions ============================ */
808
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer has to be NUL terminated:
 * no byte outside of the given lengths is ever read.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [a-z] is a range, [^...] a negated set
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non zero letters are compared ignoring their case.
 * Returns 1 if the string matches the pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match = 0;

            /* A set always consumes one byte: it can't match an empty
             * string, not even when negated. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back one byte so that the
                     * common code after the switch leaves the pattern
                     * exactly exhausted. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched as a literal backslash. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal byte needs a byte of the string to compare with. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: trailing stars still match the empty
             * remainder, anything else left in the pattern does not. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
931
/* Glob-style match of two NUL terminated C strings: measures both of them
 * and hands the work to stringmatchlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
935
936 static void redisLog(int level, const char *fmt, ...) {
937 va_list ap;
938 FILE *fp;
939
940 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
941 if (!fp) return;
942
943 va_start(ap, fmt);
944 if (level >= server.verbosity) {
945 char *c = ".-*#";
946 char buf[64];
947 time_t now;
948
949 now = time(NULL);
950 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
951 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
952 vfprintf(fp, fmt, ap);
953 fprintf(fp,"\n");
954 fflush(fp);
955 }
956 va_end(ap);
957
958 if (server.logfile) fclose(fp);
959 }
960
961 /*====================== Hash table type implementation ==================== */
962
963 /* This is a hash table type that uses the SDS dynamic strings library as
964 * keys and redis objects as values (objects can hold SDS strings,
965 * lists, sets). */
966
/* Value destructor that simply hands the value back to zfree(). The dict
 * private data is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
972
973 static void dictListDestructor(void *privdata, void *val)
974 {
975 DICT_NOTUSED(privdata);
976 listRelease((list*)val);
977 }
978
979 static int sdsDictKeyCompare(void *privdata, const void *key1,
980 const void *key2)
981 {
982 int l1,l2;
983 DICT_NOTUSED(privdata);
984
985 l1 = sdslen((sds)key1);
986 l2 = sdslen((sds)key2);
987 if (l1 != l2) return 0;
988 return memcmp(key1, key2, l1) == 0;
989 }
990
/* Destructor for keys or values that are redis objects: drops one
 * reference. The dict private data is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
998
999 static int dictObjKeyCompare(void *privdata, const void *key1,
1000 const void *key2)
1001 {
1002 const robj *o1 = key1, *o2 = key2;
1003 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1004 }
1005
1006 static unsigned int dictObjHash(const void *key) {
1007 const robj *o = key;
1008 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1009 }
1010
1011 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1015 int cmp;
1016
1017 if (o1->encoding == REDIS_ENCODING_INT &&
1018 o2->encoding == REDIS_ENCODING_INT &&
1019 o1->ptr == o2->ptr) return 1;
1020
1021 o1 = getDecodedObject(o1);
1022 o2 = getDecodedObject(o2);
1023 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1024 decrRefCount(o1);
1025 decrRefCount(o2);
1026 return cmp;
1027 }
1028
1029 static unsigned int dictEncObjHash(const void *key) {
1030 robj *o = (robj*) key;
1031
1032 if (o->encoding == REDIS_ENCODING_RAW) {
1033 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1034 } else {
1035 if (o->encoding == REDIS_ENCODING_INT) {
1036 char buf[32];
1037 int len;
1038
1039 len = snprintf(buf,32,"%ld",(long)o->ptr);
1040 return dictGenHashFunction((unsigned char*)buf, len);
1041 } else {
1042 unsigned int hash;
1043
1044 o = getDecodedObject(o);
1045 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1046 decrRefCount(o);
1047 return hash;
1048 }
1049 }
1050 }
1051
1052 /* Sets type: keys are redis objects hashed and compared through
 * dictEncObjHash()/dictEncObjKeyCompare(), entries have no value to free.
 * initServer() also uses this type for server.sharingpool, while the per-DB
 * expires dict is created there with keyptrDictType. */
1053 static dictType setDictType = {
1054 dictEncObjHash, /* hash function */
1055 NULL, /* key dup */
1056 NULL, /* val dup */
1057 dictEncObjKeyCompare, /* key compare */
1058 dictRedisObjectDestructor, /* key destructor */
1059 NULL /* val destructor */
1060 };
1061
1062 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Values are released with zfree() through dictVanillaFree(). */
1063 static dictType zsetDictType = {
1064 dictEncObjHash, /* hash function */
1065 NULL, /* key dup */
1066 NULL, /* val dup */
1067 dictEncObjKeyCompare, /* key compare */
1068 dictRedisObjectDestructor, /* key destructor */
1069 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1070 };
1071
1072 /* Db->dict: both keys and values are redis objects released by dropping
 * a reference. */
1073 static dictType dbDictType = {
1074 dictObjHash, /* hash function */
1075 NULL, /* key dup */
1076 NULL, /* val dup */
1077 dictObjKeyCompare, /* key compare */
1078 dictRedisObjectDestructor, /* key destructor */
1079 dictRedisObjectDestructor /* val destructor */
1080 };
1081
1082 /* Db->expires: keys are redis objects, values have no destructor. */
1083 static dictType keyptrDictType = {
1084 dictObjHash, /* hash function */
1085 NULL, /* key dup */
1086 NULL, /* val dup */
1087 dictObjKeyCompare, /* key compare */
1088 dictRedisObjectDestructor, /* key destructor */
1089 NULL /* val destructor */
1090 };
1091
1092 /* Hash type hash table (note that small hashes are represented with zipmaps) */
1093 static dictType hashDictType = {
1094 dictEncObjHash, /* hash function */
1095 NULL, /* key dup */
1096 NULL, /* val dup */
1097 dictEncObjKeyCompare, /* key compare */
1098 dictRedisObjectDestructor, /* key destructor */
1099 dictRedisObjectDestructor /* val destructor */
1100 };
1101
1102 /* Keylist hash table type has unencoded redis objects as keys and
1103 * lists as values. It's used for blocking operations (BLPOP) and to
1104 * map swapped keys to a list of clients waiting for these keys to be loaded. */
1105 static dictType keylistDictType = {
1106 dictObjHash, /* hash function */
1107 NULL, /* key dup */
1108 NULL, /* val dup */
1109 dictObjKeyCompare, /* key compare */
1110 dictRedisObjectDestructor, /* key destructor */
1111 dictListDestructor /* val destructor */
1112 };
1113
1114 static void version();
1115
1116 /* ========================= Random utility functions ======================= */
1117
1118 /* Redis generally does not try to recover from out of memory conditions
1119 * when allocating objects or strings, it is not clear if it will be possible
1120 * to report this condition to the client since the networking layer itself
1121 * is based on heap allocation for send buffers, so we simply abort.
1122 * At least the code will be simpler to read... */
1123 static void oom(const char *msg) {
1124 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1125 sleep(1);
1126 abort();
1127 }
1128
1129 /* ====================== Redis server networking stuff ===================== */
1130 static void closeTimedoutClients(void) {
1131 redisClient *c;
1132 listNode *ln;
1133 time_t now = time(NULL);
1134 listIter li;
1135
1136 listRewind(server.clients,&li);
1137 while ((ln = listNext(&li)) != NULL) {
1138 c = listNodeValue(ln);
1139 if (server.maxidletime &&
1140 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1141 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1142 (now - c->lastinteraction > server.maxidletime))
1143 {
1144 redisLog(REDIS_VERBOSE,"Closing idle client");
1145 freeClient(c);
1146 } else if (c->flags & REDIS_BLOCKED) {
1147 if (c->blockingto != 0 && c->blockingto < now) {
1148 addReply(c,shared.nullmultibulk);
1149 unblockClientWaitingData(c);
1150 }
1151 }
1152 }
1153 }
1154
1155 static int htNeedsResize(dict *dict) {
1156 long long size, used;
1157
1158 size = dictSlots(dict);
1159 used = dictSize(dict);
1160 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1161 (used*100/size < REDIS_HT_MINFILL));
1162 }
1163
1164 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1165 * we resize the hash table to save memory */
1166 static void tryResizeHashTables(void) {
1167 int j;
1168
1169 for (j = 0; j < server.dbnum; j++) {
1170 if (htNeedsResize(server.db[j].dict)) {
1171 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1172 dictResize(server.db[j].dict);
1173 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1174 }
1175 if (htNeedsResize(server.db[j].expires))
1176 dictResize(server.db[j].expires);
1177 }
1178 }
1179
1180 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1181 void backgroundSaveDoneHandler(int statloc) {
1182 int exitcode = WEXITSTATUS(statloc);
1183 int bysignal = WIFSIGNALED(statloc);
1184
1185 if (!bysignal && exitcode == 0) {
1186 redisLog(REDIS_NOTICE,
1187 "Background saving terminated with success");
1188 server.dirty = 0;
1189 server.lastsave = time(NULL);
1190 } else if (!bysignal && exitcode != 0) {
1191 redisLog(REDIS_WARNING, "Background saving error");
1192 } else {
1193 redisLog(REDIS_WARNING,
1194 "Background saving terminated by signal");
1195 rdbRemoveTempFile(server.bgsavechildpid);
1196 }
1197 server.bgsavechildpid = -1;
1198 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1199 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1200 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1201 }
1202
1203 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1204 * Handle this. */
1205 void backgroundRewriteDoneHandler(int statloc) {
1206 int exitcode = WEXITSTATUS(statloc);
1207 int bysignal = WIFSIGNALED(statloc);
1208
1209 if (!bysignal && exitcode == 0) {
1210 int fd;
1211 char tmpfile[256];
1212
1213 redisLog(REDIS_NOTICE,
1214 "Background append only file rewriting terminated with success");
1215 /* Now it's time to flush the differences accumulated by the parent */
1216 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1217 fd = open(tmpfile,O_WRONLY|O_APPEND);
1218 if (fd == -1) {
1219 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1220 goto cleanup;
1221 }
1222 /* Flush our data... */
1223 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1224 (signed) sdslen(server.bgrewritebuf)) {
1225 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1226 close(fd);
1227 goto cleanup;
1228 }
1229 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1230 /* Now our work is to rename the temp file into the stable file. And
1231 * switch the file descriptor used by the server for append only. */
1232 if (rename(tmpfile,server.appendfilename) == -1) {
1233 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1234 close(fd);
1235 goto cleanup;
1236 }
1237 /* Mission completed... almost */
1238 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1239 if (server.appendfd != -1) {
1240 /* If append only is actually enabled... */
1241 close(server.appendfd);
1242 server.appendfd = fd;
1243 fsync(fd);
1244 server.appendseldb = -1; /* Make sure it will issue SELECT */
1245 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1246 } else {
1247 /* If append only is disabled we just generate a dump in this
1248 * format. Why not? */
1249 close(fd);
1250 }
1251 } else if (!bysignal && exitcode != 0) {
1252 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1253 } else {
1254 redisLog(REDIS_WARNING,
1255 "Background append only file rewriting terminated by signal");
1256 }
1257 cleanup:
1258 sdsfree(server.bgrewritebuf);
1259 server.bgrewritebuf = sdsempty();
1260 aofRemoveTempFile(server.bgrewritechildpid);
1261 server.bgrewritechildpid = -1;
1262 }
1263
/* Timer callback registered by initServer() with aeCreateTimeEvent().
 * At every call it: refreshes the cached unix time, logs periodic statistics,
 * resizes sparse hash tables, closes timed out clients, handles terminated
 * background children or starts a new background save, expires some volatile
 * keys, swaps objects out when VM is over its memory limit, and tries to
 * connect to the master if needed. Many of these steps only run once every
 * N calls, N being checked against the server.cronloops counter.
 *
 * Always returns 100. NOTE(review): presumably the number of milliseconds
 * before the next call -- confirm against the ae time event contract. */
1264 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1265 int j, loops = server.cronloops++;
1266 REDIS_NOTUSED(eventLoop);
1267 REDIS_NOTUSED(id);
1268 REDIS_NOTUSED(clientData);
1269
1270 /* We take a cached value of the unix time in the global state because
1271 * with virtual memory and aging we need to store the current time
1272 * in objects at every object access, and accuracy is not needed.
1273 * To access a global var is faster than calling time(NULL) */
1274 server.unixtime = time(NULL);
1275
1276 /* Show some info about non-empty databases (once every 50 calls) */
1277 for (j = 0; j < server.dbnum; j++) {
1278 long long size, used, vkeys;
1279
1280 size = dictSlots(server.db[j].dict);
1281 used = dictSize(server.db[j].dict);
1282 vkeys = dictSize(server.db[j].expires);
1283 if (!(loops % 50) && (used || vkeys)) {
1284 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1285 /* dictPrintStats(server.dict); */
1286 }
1287 }
1288
1289 /* We don't want to resize the hash tables while a background saving
1290 * is in progress: the saving child is created using fork() that is
1291 * implemented with a copy-on-write semantic in most modern systems, so
1292 * if we resize the HT while there is the saving child at work actually
1293 * a lot of memory movements in the parent will cause a lot of pages
1294 * copied. */
1295 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1296
1297 /* Show information about connected clients (once every 50 calls) */
1298 if (!(loops % 50)) {
1299 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1300 listLength(server.clients)-listLength(server.slaves),
1301 listLength(server.slaves),
1302 zmalloc_used_memory(),
1303 dictSize(server.sharingpool));
1304 }
1305
1306 /* Close connections of timedout clients. The scan runs at every call
 * while there are clients blocked in BLPOP, since it also handles
 * their timeouts. */
1307 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1308 closeTimedoutClients();
1309
1310 /* Check if a background saving or AOF rewrite in progress terminated */
1311 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1312 int statloc;
1313 pid_t pid;
1314
1315 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
/* NOTE(review): every non-zero pid different from the BGSAVE child,
 * including a -1 error return of wait3(), is handled as the termination
 * of the AOF rewrite child -- confirm this is intended. */
1316 if (pid == server.bgsavechildpid) {
1317 backgroundSaveDoneHandler(statloc);
1318 } else {
1319 backgroundRewriteDoneHandler(statloc);
1320 }
1321 }
1322 } else {
1323 /* If there is not a background saving in progress check if
1324 * we have to save now: the first save point with enough changes
 * and enough elapsed time starts a background save. */
1325 time_t now = time(NULL);
1326 for (j = 0; j < server.saveparamslen; j++) {
1327 struct saveparam *sp = server.saveparams+j;
1328
1329 if (server.dirty >= sp->changes &&
1330 now-server.lastsave > sp->seconds) {
1331 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1332 sp->changes, sp->seconds);
1333 rdbSaveBackground(server.dbfilename);
1334 break;
1335 }
1336 }
1337 }
1338
1339 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1340 * will use few CPU cycles if there are few expiring keys, otherwise
1341 * it will get more aggressive to avoid that too much memory is used by
1342 * keys that can be removed from the keyspace. */
1343 for (j = 0; j < server.dbnum; j++) {
1344 int expired;
1345 redisDb *db = server.db+j;
1346
1347 /* Continue to expire if at the end of the cycle more than 25%
1348 * of the keys were expired. */
1349 do {
1350 long num = dictSize(db->expires);
1351 time_t now = time(NULL);
1352
1353 expired = 0;
/* Sample at most REDIS_EXPIRELOOKUPS_PER_CRON random keys per cycle. */
1354 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1355 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1356 while (num--) {
1357 dictEntry *de;
1358 time_t t;
1359
1360 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
/* The value of the entry is the expire time itself. */
1361 t = (time_t) dictGetEntryVal(de);
1362 if (now > t) {
1363 deleteKey(db,dictGetEntryKey(de));
1364 expired++;
1365 server.stat_expiredkeys++;
1366 }
1367 }
1368 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1369 }
1370
1371 /* Swap a few keys on disk if we are over the memory limit and VM
1372 * is enabled. Try to free objects from the free list first. */
1373 if (vmCanSwapOut()) {
1374 while (server.vm_enabled && zmalloc_used_memory() >
1375 server.vm_max_memory)
1376 {
1377 int retval;
1378
1379 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1380 retval = (server.vm_max_threads == 0) ?
1381 vmSwapOneObjectBlocking() :
1382 vmSwapOneObjectThreaded();
1383 if (retval == REDIS_ERR && !(loops % 300) &&
1384 zmalloc_used_memory() >
1385 (server.vm_max_memory+server.vm_max_memory/10))
1386 {
1387 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1388 }
1389 /* Note that when using threaded I/O we free just one object,
1390 * because anyway when the I/O thread in charge to swap this
1391 * object out will finish, the handler of completed jobs
1392 * will try to swap more objects if we are still out of memory. */
1393 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1394 }
1395 }
1396
1397 /* Check if we should connect to a MASTER (once every 10 calls) */
1398 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1399 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1400 if (syncWithMaster() == REDIS_OK) {
1401 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1402 }
1403 }
1404 return 100;
1405 }
1406
1407 /* This function gets called every time Redis is entering the
1408 * main loop of the event driven library, that is, before to sleep
1409 * for ready file descriptors. */
1410 static void beforeSleep(struct aeEventLoop *eventLoop) {
1411 REDIS_NOTUSED(eventLoop);
1412
1413 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1414 listIter li;
1415 listNode *ln;
1416
1417 listRewind(server.io_ready_clients,&li);
1418 while((ln = listNext(&li))) {
1419 redisClient *c = ln->value;
1420 struct redisCommand *cmd;
1421
1422 /* Resume the client. */
1423 listDelNode(server.io_ready_clients,ln);
1424 c->flags &= (~REDIS_IO_WAIT);
1425 server.vm_blocked_clients--;
1426 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1427 readQueryFromClient, c);
1428 cmd = lookupCommand(c->argv[0]->ptr);
1429 assert(cmd != NULL);
1430 call(c,cmd);
1431 resetClient(c);
1432 /* There may be more data to process in the input buffer. */
1433 if (c->querybuf && sdslen(c->querybuf) > 0)
1434 processInputBuffer(c);
1435 }
1436 }
1437 }
1438
1439 static void createSharedObjects(void) {
1440 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1441 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1442 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1443 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1444 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1445 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1446 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1447 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1448 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1449 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1450 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1451 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1452 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1453 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1454 "-ERR no such key\r\n"));
1455 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1456 "-ERR syntax error\r\n"));
1457 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1458 "-ERR source and destination objects are the same\r\n"));
1459 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1460 "-ERR index out of range\r\n"));
1461 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1462 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1463 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1464 shared.select0 = createStringObject("select 0\r\n",10);
1465 shared.select1 = createStringObject("select 1\r\n",10);
1466 shared.select2 = createStringObject("select 2\r\n",10);
1467 shared.select3 = createStringObject("select 3\r\n",10);
1468 shared.select4 = createStringObject("select 4\r\n",10);
1469 shared.select5 = createStringObject("select 5\r\n",10);
1470 shared.select6 = createStringObject("select 6\r\n",10);
1471 shared.select7 = createStringObject("select 7\r\n",10);
1472 shared.select8 = createStringObject("select 8\r\n",10);
1473 shared.select9 = createStringObject("select 9\r\n",10);
1474 }
1475
1476 static void appendServerSaveParams(time_t seconds, int changes) {
1477 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1478 server.saveparams[server.saveparamslen].seconds = seconds;
1479 server.saveparams[server.saveparamslen].changes = changes;
1480 server.saveparamslen++;
1481 }
1482
1483 static void resetServerSaveParams() {
1484 zfree(server.saveparams);
1485 server.saveparams = NULL;
1486 server.saveparamslen = 0;
1487 }
1488
1489 static void initServerConfig() {
1490 server.dbnum = REDIS_DEFAULT_DBNUM;
1491 server.port = REDIS_SERVERPORT;
1492 server.verbosity = REDIS_VERBOSE;
1493 server.maxidletime = REDIS_MAXIDLETIME;
1494 server.saveparams = NULL;
1495 server.logfile = NULL; /* NULL = log on standard output */
1496 server.bindaddr = NULL;
1497 server.glueoutputbuf = 1;
1498 server.daemonize = 0;
1499 server.appendonly = 0;
1500 server.appendfsync = APPENDFSYNC_ALWAYS;
1501 server.lastfsync = time(NULL);
1502 server.appendfd = -1;
1503 server.appendseldb = -1; /* Make sure the first time will not match */
1504 server.pidfile = zstrdup("/var/run/redis.pid");
1505 server.dbfilename = zstrdup("dump.rdb");
1506 server.appendfilename = zstrdup("appendonly.aof");
1507 server.requirepass = NULL;
1508 server.shareobjects = 0;
1509 server.rdbcompression = 1;
1510 server.sharingpoolsize = 1024;
1511 server.maxclients = 0;
1512 server.blpop_blocked_clients = 0;
1513 server.maxmemory = 0;
1514 server.vm_enabled = 0;
1515 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1516 server.vm_page_size = 256; /* 256 bytes per page */
1517 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1518 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1519 server.vm_max_threads = 4;
1520 server.vm_blocked_clients = 0;
1521 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1522 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1523
1524 resetServerSaveParams();
1525
1526 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1527 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1528 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1529 /* Replication related */
1530 server.isslave = 0;
1531 server.masterauth = NULL;
1532 server.masterhost = NULL;
1533 server.masterport = 6379;
1534 server.master = NULL;
1535 server.replstate = REDIS_REPL_NONE;
1536
1537 /* Double constants initialization */
1538 R_Zero = 0.0;
1539 R_PosInf = 1.0/R_Zero;
1540 R_NegInf = -1.0/R_Zero;
1541 R_Nan = R_Zero/R_Zero;
1542 }
1543
1544 static void initServer() {
1545 int j;
1546
1547 signal(SIGHUP, SIG_IGN);
1548 signal(SIGPIPE, SIG_IGN);
1549 setupSigSegvAction();
1550
1551 server.devnull = fopen("/dev/null","w");
1552 if (server.devnull == NULL) {
1553 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1554 exit(1);
1555 }
1556 server.clients = listCreate();
1557 server.slaves = listCreate();
1558 server.monitors = listCreate();
1559 server.objfreelist = listCreate();
1560 createSharedObjects();
1561 server.el = aeCreateEventLoop();
1562 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1563 server.sharingpool = dictCreate(&setDictType,NULL);
1564 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1565 if (server.fd == -1) {
1566 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1567 exit(1);
1568 }
1569 for (j = 0; j < server.dbnum; j++) {
1570 server.db[j].dict = dictCreate(&dbDictType,NULL);
1571 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1572 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1573 if (server.vm_enabled)
1574 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1575 server.db[j].id = j;
1576 }
1577 server.cronloops = 0;
1578 server.bgsavechildpid = -1;
1579 server.bgrewritechildpid = -1;
1580 server.bgrewritebuf = sdsempty();
1581 server.lastsave = time(NULL);
1582 server.dirty = 0;
1583 server.stat_numcommands = 0;
1584 server.stat_numconnections = 0;
1585 server.stat_expiredkeys = 0;
1586 server.stat_starttime = time(NULL);
1587 server.unixtime = time(NULL);
1588 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1589 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1590 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1591
1592 if (server.appendonly) {
1593 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1594 if (server.appendfd == -1) {
1595 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1596 strerror(errno));
1597 exit(1);
1598 }
1599 }
1600
1601 if (server.vm_enabled) vmInit();
1602 }
1603
1604 /* Empty the whole database */
1605 static long long emptyDb() {
1606 int j;
1607 long long removed = 0;
1608
1609 for (j = 0; j < server.dbnum; j++) {
1610 removed += dictSize(server.db[j].dict);
1611 dictEmpty(server.db[j].dict);
1612 dictEmpty(server.db[j].expires);
1613 }
1614 return removed;
1615 }
1616
/* Convert a "yes" / "no" string, compared ignoring case, into 1 / 0.
 * Returns -1 if the string is neither of the two. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1622
1623 /* I agree, this is a very rudimental way to load a configuration...
1624 will improve later if the config gets more complex */
1625 static void loadServerConfig(char *filename) {
1626 FILE *fp;
1627 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1628 int linenum = 0;
1629 sds line = NULL;
1630 char *errormsg = "Fatal error, can't open config file '%s'";
1631 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1632 sprintf(errorbuf, errormsg, filename);
1633
1634 if (filename[0] == '-' && filename[1] == '\0')
1635 fp = stdin;
1636 else {
1637 if ((fp = fopen(filename,"r")) == NULL) {
1638 redisLog(REDIS_WARNING, errorbuf);
1639 exit(1);
1640 }
1641 }
1642
1643 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1644 sds *argv;
1645 int argc, j;
1646
1647 linenum++;
1648 line = sdsnew(buf);
1649 line = sdstrim(line," \t\r\n");
1650
1651 /* Skip comments and blank lines*/
1652 if (line[0] == '#' || line[0] == '\0') {
1653 sdsfree(line);
1654 continue;
1655 }
1656
1657 /* Split into arguments */
1658 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1659 sdstolower(argv[0]);
1660
1661 /* Execute config directives */
1662 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1663 server.maxidletime = atoi(argv[1]);
1664 if (server.maxidletime < 0) {
1665 err = "Invalid timeout value"; goto loaderr;
1666 }
1667 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1668 server.port = atoi(argv[1]);
1669 if (server.port < 1 || server.port > 65535) {
1670 err = "Invalid port"; goto loaderr;
1671 }
1672 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1673 server.bindaddr = zstrdup(argv[1]);
1674 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1675 int seconds = atoi(argv[1]);
1676 int changes = atoi(argv[2]);
1677 if (seconds < 1 || changes < 0) {
1678 err = "Invalid save parameters"; goto loaderr;
1679 }
1680 appendServerSaveParams(seconds,changes);
1681 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1682 if (chdir(argv[1]) == -1) {
1683 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1684 argv[1], strerror(errno));
1685 exit(1);
1686 }
1687 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1688 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1689 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1690 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1691 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1692 else {
1693 err = "Invalid log level. Must be one of debug, notice, warning";
1694 goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1697 FILE *logfp;
1698
1699 server.logfile = zstrdup(argv[1]);
1700 if (!strcasecmp(server.logfile,"stdout")) {
1701 zfree(server.logfile);
1702 server.logfile = NULL;
1703 }
1704 if (server.logfile) {
1705 /* Test if we are able to open the file. The server will not
1706 * be able to abort just for this problem later... */
1707 logfp = fopen(server.logfile,"a");
1708 if (logfp == NULL) {
1709 err = sdscatprintf(sdsempty(),
1710 "Can't open the log file: %s", strerror(errno));
1711 goto loaderr;
1712 }
1713 fclose(logfp);
1714 }
1715 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1716 server.dbnum = atoi(argv[1]);
1717 if (server.dbnum < 1) {
1718 err = "Invalid number of databases"; goto loaderr;
1719 }
1720 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1721 loadServerConfig(argv[1]);
1722 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1723 server.maxclients = atoi(argv[1]);
1724 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1725 server.maxmemory = strtoll(argv[1], NULL, 10);
1726 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1727 server.masterhost = sdsnew(argv[1]);
1728 server.masterport = atoi(argv[2]);
1729 server.replstate = REDIS_REPL_CONNECT;
1730 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1731 server.masterauth = zstrdup(argv[1]);
1732 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1733 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1734 err = "argument must be 'yes' or 'no'"; goto loaderr;
1735 }
1736 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1737 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1738 err = "argument must be 'yes' or 'no'"; goto loaderr;
1739 }
1740 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1741 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1742 err = "argument must be 'yes' or 'no'"; goto loaderr;
1743 }
1744 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1745 server.sharingpoolsize = atoi(argv[1]);
1746 if (server.sharingpoolsize < 1) {
1747 err = "invalid object sharing pool size"; goto loaderr;
1748 }
1749 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1750 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1751 err = "argument must be 'yes' or 'no'"; goto loaderr;
1752 }
1753 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1754 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1755 err = "argument must be 'yes' or 'no'"; goto loaderr;
1756 }
1757 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1758 if (!strcasecmp(argv[1],"no")) {
1759 server.appendfsync = APPENDFSYNC_NO;
1760 } else if (!strcasecmp(argv[1],"always")) {
1761 server.appendfsync = APPENDFSYNC_ALWAYS;
1762 } else if (!strcasecmp(argv[1],"everysec")) {
1763 server.appendfsync = APPENDFSYNC_EVERYSEC;
1764 } else {
1765 err = "argument must be 'no', 'always' or 'everysec'";
1766 goto loaderr;
1767 }
1768 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1769 server.requirepass = zstrdup(argv[1]);
1770 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1771 zfree(server.pidfile);
1772 server.pidfile = zstrdup(argv[1]);
1773 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1774 zfree(server.dbfilename);
1775 server.dbfilename = zstrdup(argv[1]);
1776 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1777 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1778 err = "argument must be 'yes' or 'no'"; goto loaderr;
1779 }
1780 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1781 zfree(server.vm_swap_file);
1782 server.vm_swap_file = zstrdup(argv[1]);
1783 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1784 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1785 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1786 server.vm_page_size = strtoll(argv[1], NULL, 10);
1787 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1788 server.vm_pages = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1790 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1791 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1792 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1793 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1794 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1795 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1796 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1797 } else {
1798 err = "Bad directive or wrong number of arguments"; goto loaderr;
1799 }
1800 for (j = 0; j < argc; j++)
1801 sdsfree(argv[j]);
1802 zfree(argv);
1803 sdsfree(line);
1804 }
1805 if (fp != stdin) fclose(fp);
1806 return;
1807
1808 loaderr:
1809 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1810 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1811 fprintf(stderr, ">>> '%s'\n", line);
1812 fprintf(stderr, "%s\n", err);
1813 exit(1);
1814 }
1815
1816 static void freeClientArgv(redisClient *c) {
1817 int j;
1818
1819 for (j = 0; j < c->argc; j++)
1820 decrRefCount(c->argv[j]);
1821 for (j = 0; j < c->mbargc; j++)
1822 decrRefCount(c->mbargv[j]);
1823 c->argc = 0;
1824 c->mbargc = 0;
1825 }
1826
1827 static void freeClient(redisClient *c) {
1828 listNode *ln;
1829
1830 /* Note that if the client we are freeing is blocked into a blocking
1831 * call, we have to set querybuf to NULL *before* to call
1832 * unblockClientWaitingData() to avoid processInputBuffer() will get
1833 * called. Also it is important to remove the file events after
1834 * this, because this call adds the READABLE event. */
1835 sdsfree(c->querybuf);
1836 c->querybuf = NULL;
1837 if (c->flags & REDIS_BLOCKED)
1838 unblockClientWaitingData(c);
1839
1840 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1841 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1842 listRelease(c->reply);
1843 freeClientArgv(c);
1844 close(c->fd);
1845 /* Remove from the list of clients */
1846 ln = listSearchKey(server.clients,c);
1847 redisAssert(ln != NULL);
1848 listDelNode(server.clients,ln);
1849 /* Remove from the list of clients waiting for swapped keys */
1850 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1851 ln = listSearchKey(server.io_ready_clients,c);
1852 if (ln) {
1853 listDelNode(server.io_ready_clients,ln);
1854 server.vm_blocked_clients--;
1855 }
1856 }
1857 while (server.vm_enabled && listLength(c->io_keys)) {
1858 ln = listFirst(c->io_keys);
1859 dontWaitForSwappedKey(c,ln->value);
1860 }
1861 listRelease(c->io_keys);
1862 /* Other cleanup */
1863 if (c->flags & REDIS_SLAVE) {
1864 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1865 close(c->repldbfd);
1866 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1867 ln = listSearchKey(l,c);
1868 redisAssert(ln != NULL);
1869 listDelNode(l,ln);
1870 }
1871 if (c->flags & REDIS_MASTER) {
1872 server.master = NULL;
1873 server.replstate = REDIS_REPL_CONNECT;
1874 }
1875 zfree(c->argv);
1876 zfree(c->mbargv);
1877 freeClientMultiState(c);
1878 zfree(c);
1879 }
1880
1881 #define GLUEREPLY_UP_TO (1024)
1882 static void glueReplyBuffersIfNeeded(redisClient *c) {
1883 int copylen = 0;
1884 char buf[GLUEREPLY_UP_TO];
1885 listNode *ln;
1886 listIter li;
1887 robj *o;
1888
1889 listRewind(c->reply,&li);
1890 while((ln = listNext(&li))) {
1891 int objlen;
1892
1893 o = ln->value;
1894 objlen = sdslen(o->ptr);
1895 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1896 memcpy(buf+copylen,o->ptr,objlen);
1897 copylen += objlen;
1898 listDelNode(c->reply,ln);
1899 } else {
1900 if (copylen == 0) return;
1901 break;
1902 }
1903 }
1904 /* Now the output buffer is empty, add the new single element */
1905 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1906 listAddNodeHead(c->reply,o);
1907 }
1908
1909 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1910 redisClient *c = privdata;
1911 int nwritten = 0, totwritten = 0, objlen;
1912 robj *o;
1913 REDIS_NOTUSED(el);
1914 REDIS_NOTUSED(mask);
1915
1916 /* Use writev() if we have enough buffers to send */
1917 if (!server.glueoutputbuf &&
1918 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1919 !(c->flags & REDIS_MASTER))
1920 {
1921 sendReplyToClientWritev(el, fd, privdata, mask);
1922 return;
1923 }
1924
1925 while(listLength(c->reply)) {
1926 if (server.glueoutputbuf && listLength(c->reply) > 1)
1927 glueReplyBuffersIfNeeded(c);
1928
1929 o = listNodeValue(listFirst(c->reply));
1930 objlen = sdslen(o->ptr);
1931
1932 if (objlen == 0) {
1933 listDelNode(c->reply,listFirst(c->reply));
1934 continue;
1935 }
1936
1937 if (c->flags & REDIS_MASTER) {
1938 /* Don't reply to a master */
1939 nwritten = objlen - c->sentlen;
1940 } else {
1941 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1942 if (nwritten <= 0) break;
1943 }
1944 c->sentlen += nwritten;
1945 totwritten += nwritten;
1946 /* If we fully sent the object on head go to the next one */
1947 if (c->sentlen == objlen) {
1948 listDelNode(c->reply,listFirst(c->reply));
1949 c->sentlen = 0;
1950 }
1951 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1952 * bytes, in a single threaded server it's a good idea to serve
1953 * other clients as well, even if a very large request comes from
1954 * super fast link that is always able to accept data (in real world
1955 * scenario think about 'KEYS *' against the loopback interfae) */
1956 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1957 }
1958 if (nwritten == -1) {
1959 if (errno == EAGAIN) {
1960 nwritten = 0;
1961 } else {
1962 redisLog(REDIS_VERBOSE,
1963 "Error writing to client: %s", strerror(errno));
1964 freeClient(c);
1965 return;
1966 }
1967 }
1968 if (totwritten > 0) c->lastinteraction = time(NULL);
1969 if (listLength(c->reply) == 0) {
1970 c->sentlen = 0;
1971 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1972 }
1973 }
1974
1975 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1976 {
1977 redisClient *c = privdata;
1978 int nwritten = 0, totwritten = 0, objlen, willwrite;
1979 robj *o;
1980 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1981 int offset, ion = 0;
1982 REDIS_NOTUSED(el);
1983 REDIS_NOTUSED(mask);
1984
1985 listNode *node;
1986 while (listLength(c->reply)) {
1987 offset = c->sentlen;
1988 ion = 0;
1989 willwrite = 0;
1990
1991 /* fill-in the iov[] array */
1992 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1993 o = listNodeValue(node);
1994 objlen = sdslen(o->ptr);
1995
1996 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1997 break;
1998
1999 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2000 break; /* no more iovecs */
2001
2002 iov[ion].iov_base = ((char*)o->ptr) + offset;
2003 iov[ion].iov_len = objlen - offset;
2004 willwrite += objlen - offset;
2005 offset = 0; /* just for the first item */
2006 ion++;
2007 }
2008
2009 if(willwrite == 0)
2010 break;
2011
2012 /* write all collected blocks at once */
2013 if((nwritten = writev(fd, iov, ion)) < 0) {
2014 if (errno != EAGAIN) {
2015 redisLog(REDIS_VERBOSE,
2016 "Error writing to client: %s", strerror(errno));
2017 freeClient(c);
2018 return;
2019 }
2020 break;
2021 }
2022
2023 totwritten += nwritten;
2024 offset = c->sentlen;
2025
2026 /* remove written robjs from c->reply */
2027 while (nwritten && listLength(c->reply)) {
2028 o = listNodeValue(listFirst(c->reply));
2029 objlen = sdslen(o->ptr);
2030
2031 if(nwritten >= objlen - offset) {
2032 listDelNode(c->reply, listFirst(c->reply));
2033 nwritten -= objlen - offset;
2034 c->sentlen = 0;
2035 } else {
2036 /* partial write */
2037 c->sentlen += nwritten;
2038 break;
2039 }
2040 offset = 0;
2041 }
2042 }
2043
2044 if (totwritten > 0)
2045 c->lastinteraction = time(NULL);
2046
2047 if (listLength(c->reply) == 0) {
2048 c->sentlen = 0;
2049 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2050 }
2051 }
2052
2053 static struct redisCommand *lookupCommand(char *name) {
2054 int j = 0;
2055 while(cmdTable[j].name != NULL) {
2056 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2057 j++;
2058 }
2059 return NULL;
2060 }
2061
2062 /* resetClient prepare the client to process the next command */
2063 static void resetClient(redisClient *c) {
2064 freeClientArgv(c);
2065 c->bulklen = -1;
2066 c->multibulk = 0;
2067 }
2068
2069 /* Call() is the core of Redis execution of a command */
2070 static void call(redisClient *c, struct redisCommand *cmd) {
2071 long long dirty;
2072
2073 dirty = server.dirty;
2074 cmd->proc(c);
2075 if (server.appendonly && server.dirty-dirty)
2076 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2077 if (server.dirty-dirty && listLength(server.slaves))
2078 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2079 if (listLength(server.monitors))
2080 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2081 server.stat_numcommands++;
2082 }
2083
2084 /* If this function gets called we already read a whole
2085 * command, argments are in the client argv/argc fields.
2086 * processCommand() execute the command or prepare the
2087 * server for a bulk read from the client.
2088 *
2089 * If 1 is returned the client is still alive and valid and
2090 * and other operations can be performed by the caller. Otherwise
2091 * if 0 is returned the client was destroied (i.e. after QUIT). */
2092 static int processCommand(redisClient *c) {
2093 struct redisCommand *cmd;
2094
2095 /* Free some memory if needed (maxmemory setting) */
2096 if (server.maxmemory) freeMemoryIfNeeded();
2097
2098 /* Handle the multi bulk command type. This is an alternative protocol
2099 * supported by Redis in order to receive commands that are composed of
2100 * multiple binary-safe "bulk" arguments. The latency of processing is
2101 * a bit higher but this allows things like multi-sets, so if this
2102 * protocol is used only for MSET and similar commands this is a big win. */
2103 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2104 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2105 if (c->multibulk <= 0) {
2106 resetClient(c);
2107 return 1;
2108 } else {
2109 decrRefCount(c->argv[c->argc-1]);
2110 c->argc--;
2111 return 1;
2112 }
2113 } else if (c->multibulk) {
2114 if (c->bulklen == -1) {
2115 if (((char*)c->argv[0]->ptr)[0] != '$') {
2116 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2117 resetClient(c);
2118 return 1;
2119 } else {
2120 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2121 decrRefCount(c->argv[0]);
2122 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2123 c->argc--;
2124 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2125 resetClient(c);
2126 return 1;
2127 }
2128 c->argc--;
2129 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2130 return 1;
2131 }
2132 } else {
2133 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2134 c->mbargv[c->mbargc] = c->argv[0];
2135 c->mbargc++;
2136 c->argc--;
2137 c->multibulk--;
2138 if (c->multibulk == 0) {
2139 robj **auxargv;
2140 int auxargc;
2141
2142 /* Here we need to swap the multi-bulk argc/argv with the
2143 * normal argc/argv of the client structure. */
2144 auxargv = c->argv;
2145 c->argv = c->mbargv;
2146 c->mbargv = auxargv;
2147
2148 auxargc = c->argc;
2149 c->argc = c->mbargc;
2150 c->mbargc = auxargc;
2151
2152 /* We need to set bulklen to something different than -1
2153 * in order for the code below to process the command without
2154 * to try to read the last argument of a bulk command as
2155 * a special argument. */
2156 c->bulklen = 0;
2157 /* continue below and process the command */
2158 } else {
2159 c->bulklen = -1;
2160 return 1;
2161 }
2162 }
2163 }
2164 /* -- end of multi bulk commands processing -- */
2165
2166 /* The QUIT command is handled as a special case. Normal command
2167 * procs are unable to close the client connection safely */
2168 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2169 freeClient(c);
2170 return 0;
2171 }
2172
2173 /* Now lookup the command and check ASAP about trivial error conditions
2174 * such wrong arity, bad command name and so forth. */
2175 cmd = lookupCommand(c->argv[0]->ptr);
2176 if (!cmd) {
2177 addReplySds(c,
2178 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2179 (char*)c->argv[0]->ptr));
2180 resetClient(c);
2181 return 1;
2182 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2183 (c->argc < -cmd->arity)) {
2184 addReplySds(c,
2185 sdscatprintf(sdsempty(),
2186 "-ERR wrong number of arguments for '%s' command\r\n",
2187 cmd->name));
2188 resetClient(c);
2189 return 1;
2190 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2191 /* This is a bulk command, we have to read the last argument yet. */
2192 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2193
2194 decrRefCount(c->argv[c->argc-1]);
2195 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2196 c->argc--;
2197 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2198 resetClient(c);
2199 return 1;
2200 }
2201 c->argc--;
2202 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2203 /* It is possible that the bulk read is already in the
2204 * buffer. Check this condition and handle it accordingly.
2205 * This is just a fast path, alternative to call processInputBuffer().
2206 * It's a good idea since the code is small and this condition
2207 * happens most of the times. */
2208 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2209 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2210 c->argc++;
2211 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2212 } else {
2213 /* Otherwise return... there is to read the last argument
2214 * from the socket. */
2215 return 1;
2216 }
2217 }
2218 /* Let's try to share objects on the command arguments vector */
2219 if (server.shareobjects) {
2220 int j;
2221 for(j = 1; j < c->argc; j++)
2222 c->argv[j] = tryObjectSharing(c->argv[j]);
2223 }
2224 /* Let's try to encode the bulk object to save space. */
2225 if (cmd->flags & REDIS_CMD_BULK)
2226 tryObjectEncoding(c->argv[c->argc-1]);
2227
2228 /* Check if the user is authenticated */
2229 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2230 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2231 resetClient(c);
2232 return 1;
2233 }
2234
2235 /* Handle the maxmemory directive */
2236 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2237 zmalloc_used_memory() > server.maxmemory)
2238 {
2239 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2240 resetClient(c);
2241 return 1;
2242 }
2243
2244 /* Exec the command */
2245 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2246 queueMultiCommand(c,cmd);
2247 addReply(c,shared.queued);
2248 } else {
2249 if (server.vm_enabled && server.vm_max_threads > 0 &&
2250 blockClientOnSwappedKeys(cmd,c)) return 1;
2251 call(c,cmd);
2252 }
2253
2254 /* Prepare the client for the next command */
2255 resetClient(c);
2256 return 1;
2257 }
2258
2259 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2260 listNode *ln;
2261 listIter li;
2262 int outc = 0, j;
2263 robj **outv;
2264 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2265 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2266 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2267 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2268 robj *lenobj;
2269
2270 if (argc <= REDIS_STATIC_ARGS) {
2271 outv = static_outv;
2272 } else {
2273 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2274 }
2275
2276 lenobj = createObject(REDIS_STRING,
2277 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2278 lenobj->refcount = 0;
2279 outv[outc++] = lenobj;
2280 for (j = 0; j < argc; j++) {
2281 lenobj = createObject(REDIS_STRING,
2282 sdscatprintf(sdsempty(),"$%lu\r\n",
2283 (unsigned long) stringObjectLen(argv[j])));
2284 lenobj->refcount = 0;
2285 outv[outc++] = lenobj;
2286 outv[outc++] = argv[j];
2287 outv[outc++] = shared.crlf;
2288 }
2289
2290 /* Increment all the refcounts at start and decrement at end in order to
2291 * be sure to free objects if there is no slave in a replication state
2292 * able to be feed with commands */
2293 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2294 listRewind(slaves,&li);
2295 while((ln = listNext(&li))) {
2296 redisClient *slave = ln->value;
2297
2298 /* Don't feed slaves that are still waiting for BGSAVE to start */
2299 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2300
2301 /* Feed all the other slaves, MONITORs and so on */
2302 if (slave->slaveseldb != dictid) {
2303 robj *selectcmd;
2304
2305 switch(dictid) {
2306 case 0: selectcmd = shared.select0; break;
2307 case 1: selectcmd = shared.select1; break;
2308 case 2: selectcmd = shared.select2; break;
2309 case 3: selectcmd = shared.select3; break;
2310 case 4: selectcmd = shared.select4; break;
2311 case 5: selectcmd = shared.select5; break;
2312 case 6: selectcmd = shared.select6; break;
2313 case 7: selectcmd = shared.select7; break;
2314 case 8: selectcmd = shared.select8; break;
2315 case 9: selectcmd = shared.select9; break;
2316 default:
2317 selectcmd = createObject(REDIS_STRING,
2318 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2319 selectcmd->refcount = 0;
2320 break;
2321 }
2322 addReply(slave,selectcmd);
2323 slave->slaveseldb = dictid;
2324 }
2325 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2326 }
2327 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2328 if (outv != static_outv) zfree(outv);
2329 }
2330
2331 static void processInputBuffer(redisClient *c) {
2332 again:
2333 /* Before to process the input buffer, make sure the client is not
2334 * waitig for a blocking operation such as BLPOP. Note that the first
2335 * iteration the client is never blocked, otherwise the processInputBuffer
2336 * would not be called at all, but after the execution of the first commands
2337 * in the input buffer the client may be blocked, and the "goto again"
2338 * will try to reiterate. The following line will make it return asap. */
2339 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2340 if (c->bulklen == -1) {
2341 /* Read the first line of the query */
2342 char *p = strchr(c->querybuf,'\n');
2343 size_t querylen;
2344
2345 if (p) {
2346 sds query, *argv;
2347 int argc, j;
2348
2349 query = c->querybuf;
2350 c->querybuf = sdsempty();
2351 querylen = 1+(p-(query));
2352 if (sdslen(query) > querylen) {
2353 /* leave data after the first line of the query in the buffer */
2354 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2355 }
2356 *p = '\0'; /* remove "\n" */
2357 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2358 sdsupdatelen(query);
2359
2360 /* Now we can split the query in arguments */
2361 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2362 sdsfree(query);
2363
2364 if (c->argv) zfree(c->argv);
2365 c->argv = zmalloc(sizeof(robj*)*argc);
2366
2367 for (j = 0; j < argc; j++) {
2368 if (sdslen(argv[j])) {
2369 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2370 c->argc++;
2371 } else {
2372 sdsfree(argv[j]);
2373 }
2374 }
2375 zfree(argv);
2376 if (c->argc) {
2377 /* Execute the command. If the client is still valid
2378 * after processCommand() return and there is something
2379 * on the query buffer try to process the next command. */
2380 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2381 } else {
2382 /* Nothing to process, argc == 0. Just process the query
2383 * buffer if it's not empty or return to the caller */
2384 if (sdslen(c->querybuf)) goto again;
2385 }
2386 return;
2387 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2388 redisLog(REDIS_VERBOSE, "Client protocol error");
2389 freeClient(c);
2390 return;
2391 }
2392 } else {
2393 /* Bulk read handling. Note that if we are at this point
2394 the client already sent a command terminated with a newline,
2395 we are reading the bulk data that is actually the last
2396 argument of the command. */
2397 int qbl = sdslen(c->querybuf);
2398
2399 if (c->bulklen <= qbl) {
2400 /* Copy everything but the final CRLF as final argument */
2401 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2402 c->argc++;
2403 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2404 /* Process the command. If the client is still valid after
2405 * the processing and there is more data in the buffer
2406 * try to parse it. */
2407 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2408 return;
2409 }
2410 }
2411 }
2412
2413 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2414 redisClient *c = (redisClient*) privdata;
2415 char buf[REDIS_IOBUF_LEN];
2416 int nread;
2417 REDIS_NOTUSED(el);
2418 REDIS_NOTUSED(mask);
2419
2420 nread = read(fd, buf, REDIS_IOBUF_LEN);
2421 if (nread == -1) {
2422 if (errno == EAGAIN) {
2423 nread = 0;
2424 } else {
2425 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2426 freeClient(c);
2427 return;
2428 }
2429 } else if (nread == 0) {
2430 redisLog(REDIS_VERBOSE, "Client closed connection");
2431 freeClient(c);
2432 return;
2433 }
2434 if (nread) {
2435 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2436 c->lastinteraction = time(NULL);
2437 } else {
2438 return;
2439 }
2440 processInputBuffer(c);
2441 }
2442
2443 static int selectDb(redisClient *c, int id) {
2444 if (id < 0 || id >= server.dbnum)
2445 return REDIS_ERR;
2446 c->db = &server.db[id];
2447 return REDIS_OK;
2448 }
2449
2450 static void *dupClientReplyValue(void *o) {
2451 incrRefCount((robj*)o);
2452 return o;
2453 }
2454
2455 static redisClient *createClient(int fd) {
2456 redisClient *c = zmalloc(sizeof(*c));
2457
2458 anetNonBlock(NULL,fd);
2459 anetTcpNoDelay(NULL,fd);
2460 if (!c) return NULL;
2461 selectDb(c,0);
2462 c->fd = fd;
2463 c->querybuf = sdsempty();
2464 c->argc = 0;
2465 c->argv = NULL;
2466 c->bulklen = -1;
2467 c->multibulk = 0;
2468 c->mbargc = 0;
2469 c->mbargv = NULL;
2470 c->sentlen = 0;
2471 c->flags = 0;
2472 c->lastinteraction = time(NULL);
2473 c->authenticated = 0;
2474 c->replstate = REDIS_REPL_NONE;
2475 c->reply = listCreate();
2476 listSetFreeMethod(c->reply,decrRefCount);
2477 listSetDupMethod(c->reply,dupClientReplyValue);
2478 c->blockingkeys = NULL;
2479 c->blockingkeysnum = 0;
2480 c->io_keys = listCreate();
2481 listSetFreeMethod(c->io_keys,decrRefCount);
2482 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2483 readQueryFromClient, c) == AE_ERR) {
2484 freeClient(c);
2485 return NULL;
2486 }
2487 listAddNodeTail(server.clients,c);
2488 initClientMultiState(c);
2489 return c;
2490 }
2491
2492 static void addReply(redisClient *c, robj *obj) {
2493 if (listLength(c->reply) == 0 &&
2494 (c->replstate == REDIS_REPL_NONE ||
2495 c->replstate == REDIS_REPL_ONLINE) &&
2496 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2497 sendReplyToClient, c) == AE_ERR) return;
2498
2499 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2500 obj = dupStringObject(obj);
2501 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2502 }
2503 listAddNodeTail(c->reply,getDecodedObject(obj));
2504 }
2505
2506 static void addReplySds(redisClient *c, sds s) {
2507 robj *o = createObject(REDIS_STRING,s);
2508 addReply(c,o);
2509 decrRefCount(o);
2510 }
2511
2512 static void addReplyDouble(redisClient *c, double d) {
2513 char buf[128];
2514
2515 snprintf(buf,sizeof(buf),"%.17g",d);
2516 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2517 (unsigned long) strlen(buf),buf));
2518 }
2519
2520 static void addReplyLong(redisClient *c, long l) {
2521 char buf[128];
2522 size_t len;
2523
2524 if (l == 0) {
2525 addReply(c,shared.czero);
2526 return;
2527 } else if (l == 1) {
2528 addReply(c,shared.cone);
2529 return;
2530 }
2531 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2532 addReplySds(c,sdsnewlen(buf,len));
2533 }
2534
2535 static void addReplyUlong(redisClient *c, unsigned long ul) {
2536 char buf[128];
2537 size_t len;
2538
2539 if (ul == 0) {
2540 addReply(c,shared.czero);
2541 return;
2542 } else if (ul == 1) {
2543 addReply(c,shared.cone);
2544 return;
2545 }
2546 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2547 addReplySds(c,sdsnewlen(buf,len));
2548 }
2549
2550 static void addReplyBulkLen(redisClient *c, robj *obj) {
2551 size_t len;
2552
2553 if (obj->encoding == REDIS_ENCODING_RAW) {
2554 len = sdslen(obj->ptr);
2555 } else {
2556 long n = (long)obj->ptr;
2557
2558 /* Compute how many bytes will take this integer as a radix 10 string */
2559 len = 1;
2560 if (n < 0) {
2561 len++;
2562 n = -n;
2563 }
2564 while((n = n/10) != 0) {
2565 len++;
2566 }
2567 }
2568 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2569 }
2570
2571 static void addReplyBulk(redisClient *c, robj *obj) {
2572 addReplyBulkLen(c,obj);
2573 addReply(c,obj);
2574 addReply(c,shared.crlf);
2575 }
2576
2577 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2578 static void addReplyBulkCString(redisClient *c, char *s) {
2579 if (s == NULL) {
2580 addReply(c,shared.nullbulk);
2581 } else {
2582 robj *o = createStringObject(s,strlen(s));
2583 addReplyBulk(c,o);
2584 decrRefCount(o);
2585 }
2586 }
2587
2588 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2589 int cport, cfd;
2590 char cip[128];
2591 redisClient *c;
2592 REDIS_NOTUSED(el);
2593 REDIS_NOTUSED(mask);
2594 REDIS_NOTUSED(privdata);
2595
2596 cfd = anetAccept(server.neterr, fd, cip, &cport);
2597 if (cfd == AE_ERR) {
2598 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2599 return;
2600 }
2601 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2602 if ((c = createClient(cfd)) == NULL) {
2603 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2604 close(cfd); /* May be already closed, just ingore errors */
2605 return;
2606 }
2607 /* If maxclient directive is set and this is one client more... close the
2608 * connection. Note that we create the client instead to check before
2609 * for this condition, since now the socket is already set in nonblocking
2610 * mode and we can send an error for free using the Kernel I/O */
2611 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2612 char *err = "-ERR max number of clients reached\r\n";
2613
2614 /* That's a best effort error message, don't check write errors */
2615 if (write(c->fd,err,strlen(err)) == -1) {
2616 /* Nothing to do, Just to avoid the warning... */
2617 }
2618 freeClient(c);
2619 return;
2620 }
2621 server.stat_numconnections++;
2622 }
2623
2624 /* ======================= Redis objects implementation ===================== */
2625
2626 static robj *createObject(int type, void *ptr) {
2627 robj *o;
2628
2629 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2630 if (listLength(server.objfreelist)) {
2631 listNode *head = listFirst(server.objfreelist);
2632 o = listNodeValue(head);
2633 listDelNode(server.objfreelist,head);
2634 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2635 } else {
2636 if (server.vm_enabled) {
2637 pthread_mutex_unlock(&server.obj_freelist_mutex);
2638 o = zmalloc(sizeof(*o));
2639 } else {
2640 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2641 }
2642 }
2643 o->type = type;
2644 o->encoding = REDIS_ENCODING_RAW;
2645 o->ptr = ptr;
2646 o->refcount = 1;
2647 if (server.vm_enabled) {
2648 /* Note that this code may run in the context of an I/O thread
2649 * and accessing to server.unixtime in theory is an error
2650 * (no locks). But in practice this is safe, and even if we read
2651 * garbage Redis will not fail, as it's just a statistical info */
2652 o->vm.atime = server.unixtime;
2653 o->storage = REDIS_VM_MEMORY;
2654 }
2655 return o;
2656 }
2657
2658 static robj *createStringObject(char *ptr, size_t len) {
2659 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2660 }
2661
2662 static robj *dupStringObject(robj *o) {
2663 assert(o->encoding == REDIS_ENCODING_RAW);
2664 return createStringObject(o->ptr,sdslen(o->ptr));
2665 }
2666
2667 static robj *createListObject(void) {
2668 list *l = listCreate();
2669
2670 listSetFreeMethod(l,decrRefCount);
2671 return createObject(REDIS_LIST,l);
2672 }
2673
2674 static robj *createSetObject(void) {
2675 dict *d = dictCreate(&setDictType,NULL);
2676 return createObject(REDIS_SET,d);
2677 }
2678
2679 static robj *createHashObject(void) {
2680 /* All the Hashes start as zipmaps. Will be automatically converted
2681 * into hash tables if there are enough elements or big elements
2682 * inside. */
2683 unsigned char *zm = zipmapNew();
2684 robj *o = createObject(REDIS_HASH,zm);
2685 o->encoding = REDIS_ENCODING_ZIPMAP;
2686 return o;
2687 }
2688
2689 static robj *createZsetObject(void) {
2690 zset *zs = zmalloc(sizeof(*zs));
2691
2692 zs->dict = dictCreate(&zsetDictType,NULL);
2693 zs->zsl = zslCreate();
2694 return createObject(REDIS_ZSET,zs);
2695 }
2696
2697 static void freeStringObject(robj *o) {
2698 if (o->encoding == REDIS_ENCODING_RAW) {
2699 sdsfree(o->ptr);
2700 }
2701 }
2702
2703 static void freeListObject(robj *o) {
2704 listRelease((list*) o->ptr);
2705 }
2706
2707 static void freeSetObject(robj *o) {
2708 dictRelease((dict*) o->ptr);
2709 }
2710
2711 static void freeZsetObject(robj *o) {
2712 zset *zs = o->ptr;
2713
2714 dictRelease(zs->dict);
2715 zslFree(zs->zsl);
2716 zfree(zs);
2717 }
2718
2719 static void freeHashObject(robj *o) {
2720 switch (o->encoding) {
2721 case REDIS_ENCODING_HT:
2722 dictRelease((dict*) o->ptr);
2723 break;
2724 case REDIS_ENCODING_ZIPMAP:
2725 zfree(o->ptr);
2726 break;
2727 default:
2728 redisAssert(0);
2729 break;
2730 }
2731 }
2732
2733 static void incrRefCount(robj *o) {
2734 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2735 o->refcount++;
2736 }
2737
2738 static void decrRefCount(void *obj) {
2739 robj *o = obj;
2740
2741 /* Object is a key of a swapped out value, or in the process of being
2742 * loaded. */
2743 if (server.vm_enabled &&
2744 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2745 {
2746 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2747 redisAssert(o->refcount == 1);
2748 }
2749 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2750 redisAssert(o->type == REDIS_STRING);
2751 freeStringObject(o);
2752 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2753 pthread_mutex_lock(&server.obj_freelist_mutex);
2754 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2755 !listAddNodeHead(server.objfreelist,o))
2756 zfree(o);
2757 pthread_mutex_unlock(&server.obj_freelist_mutex);
2758 server.vm_stats_swapped_objects--;
2759 return;
2760 }
2761 /* Object is in memory, or in the process of being swapped out. */
2762 if (--(o->refcount) == 0) {
2763 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2764 vmCancelThreadedIOJob(obj);
2765 switch(o->type) {
2766 case REDIS_STRING: freeStringObject(o); break;
2767 case REDIS_LIST: freeListObject(o); break;
2768 case REDIS_SET: freeSetObject(o); break;
2769 case REDIS_ZSET: freeZsetObject(o); break;
2770 case REDIS_HASH: freeHashObject(o); break;
2771 default: redisAssert(0); break;
2772 }
2773 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2774 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2775 !listAddNodeHead(server.objfreelist,o))
2776 zfree(o);
2777 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2778 }
2779 }
2780
2781 static robj *lookupKey(redisDb *db, robj *key) {
2782 dictEntry *de = dictFind(db->dict,key);
2783 if (de) {
2784 robj *key = dictGetEntryKey(de);
2785 robj *val = dictGetEntryVal(de);
2786
2787 if (server.vm_enabled) {
2788 if (key->storage == REDIS_VM_MEMORY ||
2789 key->storage == REDIS_VM_SWAPPING)
2790 {
2791 /* If we were swapping the object out, stop it, this key
2792 * was requested. */
2793 if (key->storage == REDIS_VM_SWAPPING)
2794 vmCancelThreadedIOJob(key);
2795 /* Update the access time of the key for the aging algorithm. */
2796 key->vm.atime = server.unixtime;
2797 } else {
2798 int notify = (key->storage == REDIS_VM_LOADING);
2799
2800 /* Our value was swapped on disk. Bring it at home. */
2801 redisAssert(val == NULL);
2802 val = vmLoadObject(key);
2803 dictGetEntryVal(de) = val;
2804
2805 /* Clients blocked by the VM subsystem may be waiting for
2806 * this key... */
2807 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2808 }
2809 }
2810 return val;
2811 } else {
2812 return NULL;
2813 }
2814 }
2815
2816 static robj *lookupKeyRead(redisDb *db, robj *key) {
2817 expireIfNeeded(db,key);
2818 return lookupKey(db,key);
2819 }
2820
2821 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2822 deleteIfVolatile(db,key);
2823 return lookupKey(db,key);
2824 }
2825
2826 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2827 robj *o = lookupKeyRead(c->db, key);
2828 if (!o) addReply(c,reply);
2829 return o;
2830 }
2831
2832 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2833 robj *o = lookupKeyWrite(c->db, key);
2834 if (!o) addReply(c,reply);
2835 return o;
2836 }
2837
2838 static int checkType(redisClient *c, robj *o, int type) {
2839 if (o->type != type) {
2840 addReply(c,shared.wrongtypeerr);
2841 return 1;
2842 }
2843 return 0;
2844 }
2845
2846 static int deleteKey(redisDb *db, robj *key) {
2847 int retval;
2848
2849 /* We need to protect key from destruction: after the first dictDelete()
2850 * it may happen that 'key' is no longer valid if we don't increment
2851 * it's count. This may happen when we get the object reference directly
2852 * from the hash table with dictRandomKey() or dict iterators */
2853 incrRefCount(key);
2854 if (dictSize(db->expires)) dictDelete(db->expires,key);
2855 retval = dictDelete(db->dict,key);
2856 decrRefCount(key);
2857
2858 return retval == DICT_OK;
2859 }
2860
2861 /* Try to share an object against the shared objects pool */
2862 static robj *tryObjectSharing(robj *o) {
2863 struct dictEntry *de;
2864 unsigned long c;
2865
2866 if (o == NULL || server.shareobjects == 0) return o;
2867
2868 redisAssert(o->type == REDIS_STRING);
2869 de = dictFind(server.sharingpool,o);
2870 if (de) {
2871 robj *shared = dictGetEntryKey(de);
2872
2873 c = ((unsigned long) dictGetEntryVal(de))+1;
2874 dictGetEntryVal(de) = (void*) c;
2875 incrRefCount(shared);
2876 decrRefCount(o);
2877 return shared;
2878 } else {
2879 /* Here we are using a stream algorihtm: Every time an object is
2880 * shared we increment its count, everytime there is a miss we
2881 * recrement the counter of a random object. If this object reaches
2882 * zero we remove the object and put the current object instead. */
2883 if (dictSize(server.sharingpool) >=
2884 server.sharingpoolsize) {
2885 de = dictGetRandomKey(server.sharingpool);
2886 redisAssert(de != NULL);
2887 c = ((unsigned long) dictGetEntryVal(de))-1;
2888 dictGetEntryVal(de) = (void*) c;
2889 if (c == 0) {
2890 dictDelete(server.sharingpool,de->key);
2891 }
2892 } else {
2893 c = 0; /* If the pool is empty we want to add this object */
2894 }
2895 if (c == 0) {
2896 int retval;
2897
2898 retval = dictAdd(server.sharingpool,o,(void*)1);
2899 redisAssert(retval == DICT_OK);
2900 incrRefCount(o);
2901 }
2902 return o;
2903 }
2904 }
2905
2906 /* Check if the nul-terminated string 's' can be represented by a long
2907 * (that is, is a number that fits into long without any other space or
2908 * character before or after the digits).
2909 *
2910 * If so, the function returns REDIS_OK and *longval is set to the value
2911 * of the number. Otherwise REDIS_ERR is returned */
2912 static int isStringRepresentableAsLong(sds s, long *longval) {
2913 char buf[32], *endptr;
2914 long value;
2915 int slen;
2916
2917 value = strtol(s, &endptr, 10);
2918 if (endptr[0] != '\0') return REDIS_ERR;
2919 slen = snprintf(buf,32,"%ld",value);
2920
2921 /* If the number converted back into a string is not identical
2922 * then it's not possible to encode the string as integer */
2923 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2924 if (longval) *longval = value;
2925 return REDIS_OK;
2926 }
2927
2928 /* Try to encode a string object in order to save space */
2929 static int tryObjectEncoding(robj *o) {
2930 long value;
2931 sds s = o->ptr;
2932
2933 if (o->encoding != REDIS_ENCODING_RAW)
2934 return REDIS_ERR; /* Already encoded */
2935
2936 /* It's not save to encode shared objects: shared objects can be shared
2937 * everywhere in the "object space" of Redis. Encoded objects can only
2938 * appear as "values" (and not, for instance, as keys) */
2939 if (o->refcount > 1) return REDIS_ERR;
2940
2941 /* Currently we try to encode only strings */
2942 redisAssert(o->type == REDIS_STRING);
2943
2944 /* Check if we can represent this string as a long integer */
2945 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2946
2947 /* Ok, this object can be encoded */
2948 o->encoding = REDIS_ENCODING_INT;
2949 sdsfree(o->ptr);
2950 o->ptr = (void*) value;
2951 return REDIS_OK;
2952 }
2953
2954 /* Get a decoded version of an encoded object (returned as a new object).
2955 * If the object is already raw-encoded just increment the ref count. */
2956 static robj *getDecodedObject(robj *o) {
2957 robj *dec;
2958
2959 if (o->encoding == REDIS_ENCODING_RAW) {
2960 incrRefCount(o);
2961 return o;
2962 }
2963 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2964 char buf[32];
2965
2966 snprintf(buf,32,"%ld",(long)o->ptr);
2967 dec = createStringObject(buf,strlen(buf));
2968 return dec;
2969 } else {
2970 redisAssert(1 != 1);
2971 }
2972 }
2973
2974 /* Compare two string objects via strcmp() or alike.
2975 * Note that the objects may be integer-encoded. In such a case we
2976 * use snprintf() to get a string representation of the numbers on the stack
2977 * and compare the strings, it's much faster than calling getDecodedObject().
2978 *
2979 * Important note: if objects are not integer encoded, but binary-safe strings,
2980 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2981 * binary safe. */
2982 static int compareStringObjects(robj *a, robj *b) {
2983 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2984 char bufa[128], bufb[128], *astr, *bstr;
2985 int bothsds = 1;
2986
2987 if (a == b) return 0;
2988 if (a->encoding != REDIS_ENCODING_RAW) {
2989 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2990 astr = bufa;
2991 bothsds = 0;
2992 } else {
2993 astr = a->ptr;
2994 }
2995 if (b->encoding != REDIS_ENCODING_RAW) {
2996 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2997 bstr = bufb;
2998 bothsds = 0;
2999 } else {
3000 bstr = b->ptr;
3001 }
3002 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3003 }
3004
3005 static size_t stringObjectLen(robj *o) {
3006 redisAssert(o->type == REDIS_STRING);
3007 if (o->encoding == REDIS_ENCODING_RAW) {
3008 return sdslen(o->ptr);
3009 } else {
3010 char buf[32];
3011
3012 return snprintf(buf,32,"%ld",(long)o->ptr);
3013 }
3014 }
3015
3016 /*============================ RDB saving/loading =========================== */
3017
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3022
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3028
3029 /* check rdbLoadLen() comments for more info */
3030 static int rdbSaveLen(FILE *fp, uint32_t len) {
3031 unsigned char buf[2];
3032
3033 if (len < (1<<6)) {
3034 /* Save a 6 bit len */
3035 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3036 if (fwrite(buf,1,1,fp) == 0) return -1;
3037 } else if (len < (1<<14)) {
3038 /* Save a 14 bit len */
3039 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3040 buf[1] = len&0xFF;
3041 if (fwrite(buf,2,1,fp) == 0) return -1;
3042 } else {
3043 /* Save a 32 bit len */
3044 buf[0] = (REDIS_RDB_32BITLEN<<6);
3045 if (fwrite(buf,1,1,fp) == 0) return -1;
3046 len = htonl(len);
3047 if (fwrite(&len,4,1,fp) == 0) return -1;
3048 }
3049 return 0;
3050 }
3051
3052 /* String objects in the form "2391" "-100" without any space and with a
3053 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3054 * encoded as integers to save space */
3055 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3056 long long value;
3057 char *endptr, buf[32];
3058
3059 /* Check if it's possible to encode this value as a number */
3060 value = strtoll(s, &endptr, 10);
3061 if (endptr[0] != '\0') return 0;
3062 snprintf(buf,32,"%lld",value);
3063
3064 /* If the number converted back into a string is not identical
3065 * then it's not possible to encode the string as integer */
3066 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3067
3068 /* Finally check if it fits in our ranges */
3069 if (value >= -(1<<7) && value <= (1<<7)-1) {
3070 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3071 enc[1] = value&0xFF;
3072 return 2;
3073 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3074 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3075 enc[1] = value&0xFF;
3076 enc[2] = (value>>8)&0xFF;
3077 return 3;
3078 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3079 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3080 enc[1] = value&0xFF;
3081 enc[2] = (value>>8)&0xFF;
3082 enc[3] = (value>>16)&0xFF;
3083 enc[4] = (value>>24)&0xFF;
3084 return 5;
3085 } else {
3086 return 0;
3087 }
3088 }
3089
3090 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3091 size_t comprlen, outlen;
3092 unsigned char byte;
3093 void *out;
3094
3095 /* We require at least four bytes compression for this to be worth it */
3096 if (len <= 4) return 0;
3097 outlen = len-4;
3098 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3099 comprlen = lzf_compress(s, len, out, outlen);
3100 if (comprlen == 0) {
3101 zfree(out);
3102 return 0;
3103 }
3104 /* Data compressed! Let's save it on disk */
3105 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3106 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3107 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3108 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3109 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3110 zfree(out);
3111 return comprlen;
3112
3113 writeerr:
3114 zfree(out);
3115 return -1;
3116 }
3117
3118 /* Save a string objet as [len][data] on disk. If the object is a string
3119 * representation of an integer value we try to safe it in a special form */
3120 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3121 int enclen;
3122
3123 /* Try integer encoding */
3124 if (len <= 11) {
3125 unsigned char buf[5];
3126 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3127 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3128 return 0;
3129 }
3130 }
3131
3132 /* Try LZF compression - under 20 bytes it's unable to compress even
3133 * aaaaaaaaaaaaaaaaaa so skip it */
3134 if (server.rdbcompression && len > 20) {
3135 int retval;
3136
3137 retval = rdbSaveLzfStringObject(fp,s,len);
3138 if (retval == -1) return -1;
3139 if (retval > 0) return 0;
3140 /* retval == 0 means data can't be compressed, save the old way */
3141 }
3142
3143 /* Store verbatim */
3144 if (rdbSaveLen(fp,len) == -1) return -1;
3145 if (len && fwrite(s,len,1,fp) == 0) return -1;
3146 return 0;
3147 }
3148
3149 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3150 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3151 int retval;
3152
3153 /* Avoid incr/decr ref count business when possible.
3154 * This plays well with copy-on-write given that we are probably
3155 * in a child process (BGSAVE). Also this makes sure key objects
3156 * of swapped objects are not incRefCount-ed (an assert does not allow
3157 * this in order to avoid bugs) */
3158 if (obj->encoding != REDIS_ENCODING_RAW) {
3159 obj = getDecodedObject(obj);
3160 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3161 decrRefCount(obj);
3162 } else {
3163 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3164 }
3165 return retval;
3166 }
3167
/* Save a double value. A double is saved as a one byte unsigned length
 * followed by its "%.17g" string representation. Three values of the
 * length byte are special and are not followed by any string:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3194
3195 /* Save a Redis object. */
3196 static int rdbSaveObject(FILE *fp, robj *o) {
3197 if (o->type == REDIS_STRING) {
3198 /* Save a string value */
3199 if (rdbSaveStringObject(fp,o) == -1) return -1;
3200 } else if (o->type == REDIS_LIST) {
3201 /* Save a list value */
3202 list *list = o->ptr;
3203 listIter li;
3204 listNode *ln;
3205
3206 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3207 listRewind(list,&li);
3208 while((ln = listNext(&li))) {
3209 robj *eleobj = listNodeValue(ln);
3210
3211 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3212 }
3213 } else if (o->type == REDIS_SET) {
3214 /* Save a set value */
3215 dict *set = o->ptr;
3216 dictIterator *di = dictGetIterator(set);
3217 dictEntry *de;
3218
3219 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3220 while((de = dictNext(di)) != NULL) {
3221 robj *eleobj = dictGetEntryKey(de);
3222
3223 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3224 }
3225 dictReleaseIterator(di);
3226 } else if (o->type == REDIS_ZSET) {
3227 /* Save a set value */
3228 zset *zs = o->ptr;
3229 dictIterator *di = dictGetIterator(zs->dict);
3230 dictEntry *de;
3231
3232 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3233 while((de = dictNext(di)) != NULL) {
3234 robj *eleobj = dictGetEntryKey(de);
3235 double *score = dictGetEntryVal(de);
3236
3237 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3238 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3239 }
3240 dictReleaseIterator(di);
3241 } else if (o->type == REDIS_HASH) {
3242 /* Save a hash value */
3243 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3244 unsigned char *p = zipmapRewind(o->ptr);
3245 unsigned int count = zipmapLen(o->ptr);
3246 unsigned char *key, *val;
3247 unsigned int klen, vlen;
3248
3249 if (rdbSaveLen(fp,count) == -1) return -1;
3250 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3251 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3252 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3253 }
3254 } else {
3255 dictIterator *di = dictGetIterator(o->ptr);
3256 dictEntry *de;
3257
3258 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3259 while((de = dictNext(di)) != NULL) {
3260 robj *key = dictGetEntryKey(de);
3261 robj *val = dictGetEntryVal(de);
3262
3263 if (rdbSaveStringObject(fp,key) == -1) return -1;
3264 if (rdbSaveStringObject(fp,val) == -1) return -1;
3265 }
3266 dictReleaseIterator(di);
3267 }
3268 } else {
3269 redisAssert(0);
3270 }
3271 return 0;
3272 }
3273
3274 /* Return the length the object will have on disk if saved with
3275 * the rdbSaveObject() function. Currently we use a trick to get
3276 * this length with very little changes to the code. In the future
3277 * we could switch to a faster solution. */
3278 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3279 if (fp == NULL) fp = server.devnull;
3280 rewind(fp);
3281 assert(rdbSaveObject(fp,o) != 1);
3282 return ftello(fp);
3283 }
3284
3285 /* Return the number of pages required to save this object in the swap file */
3286 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3287 off_t bytes = rdbSavedObjectLen(o,fp);
3288
3289 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3290 }
3291
3292 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3293 static int rdbSave(char *filename) {
3294 dictIterator *di = NULL;
3295 dictEntry *de;
3296 FILE *fp;
3297 char tmpfile[256];
3298 int j;
3299 time_t now = time(NULL);
3300
3301 /* Wait for I/O therads to terminate, just in case this is a
3302 * foreground-saving, to avoid seeking the swap file descriptor at the
3303 * same time. */
3304 if (server.vm_enabled)
3305 waitEmptyIOJobsQueue();
3306
3307 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3308 fp = fopen(tmpfile,"w");
3309 if (!fp) {
3310 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3311 return REDIS_ERR;
3312 }
3313 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3314 for (j = 0; j < server.dbnum; j++) {
3315 redisDb *db = server.db+j;
3316 dict *d = db->dict;
3317 if (dictSize(d) == 0) continue;
3318 di = dictGetIterator(d);
3319 if (!di) {
3320 fclose(fp);
3321 return REDIS_ERR;
3322 }
3323
3324 /* Write the SELECT DB opcode */
3325 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3326 if (rdbSaveLen(fp,j) == -1) goto werr;
3327
3328 /* Iterate this DB writing every entry */
3329 while((de = dictNext(di)) != NULL) {
3330 robj *key = dictGetEntryKey(de);
3331 robj *o = dictGetEntryVal(de);
3332 time_t expiretime = getExpire(db,key);
3333
3334 /* Save the expire time */
3335 if (expiretime != -1) {
3336 /* If this key is already expired skip it */
3337 if (expiretime < now) continue;
3338 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3339 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3340 }
3341 /* Save the key and associated value. This requires special
3342 * handling if the value is swapped out. */
3343 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3344 key->storage == REDIS_VM_SWAPPING) {
3345 /* Save type, key, value */
3346 if (rdbSaveType(fp,o->type) == -1) goto werr;
3347 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3348 if (rdbSaveObject(fp,o) == -1) goto werr;
3349 } else {
3350 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3351 robj *po;
3352 /* Get a preview of the object in memory */
3353 po = vmPreviewObject(key);
3354 /* Save type, key, value */
3355 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3356 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3357 if (rdbSaveObject(fp,po) == -1) goto werr;
3358 /* Remove the loaded object from memory */
3359 decrRefCount(po);
3360 }
3361 }
3362 dictReleaseIterator(di);
3363 }
3364 /* EOF opcode */
3365 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3366
3367 /* Make sure data will not remain on the OS's output buffers */
3368 fflush(fp);
3369 fsync(fileno(fp));
3370 fclose(fp);
3371
3372 /* Use RENAME to make sure the DB file is changed atomically only
3373 * if the generate DB file is ok. */
3374 if (rename(tmpfile,filename) == -1) {
3375 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3376 unlink(tmpfile);
3377 return REDIS_ERR;
3378 }
3379 redisLog(REDIS_NOTICE,"DB saved on disk");
3380 server.dirty = 0;
3381 server.lastsave = time(NULL);
3382 return REDIS_OK;
3383
3384 werr:
3385 fclose(fp);
3386 unlink(tmpfile);
3387 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3388 if (di) dictReleaseIterator(di);
3389 return REDIS_ERR;
3390 }
3391
3392 static int rdbSaveBackground(char *filename) {
3393 pid_t childpid;
3394
3395 if (server.bgsavechildpid != -1) return REDIS_ERR;
3396 if (server.vm_enabled) waitEmptyIOJobsQueue();
3397 if ((childpid = fork()) == 0) {
3398 /* Child */
3399 if (server.vm_enabled) vmReopenSwapFile();
3400 close(server.fd);
3401 if (rdbSave(filename) == REDIS_OK) {
3402 _exit(0);
3403 } else {
3404 _exit(1);
3405 }
3406 } else {
3407 /* Parent */
3408 if (childpid == -1) {
3409 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3410 strerror(errno));
3411 return REDIS_ERR;
3412 }
3413 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3414 server.bgsavechildpid = childpid;
3415 return REDIS_OK;
3416 }
3417 return REDIS_OK; /* unreached */
3418 }
3419
/* Remove the temp file "temp-<childpid>.rdb", the name rdbSave() uses when
 * running in the process with that pid. Errors are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3426
/* Read a one byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3432
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) == 0) return -1;
    return (time_t) stamp;
}
3438
3439 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3440 * of this file for a description of how this are stored on disk.
3441 *
3442 * isencoded is set to 1 if the readed length is not actually a length but
3443 * an "encoding type", check the above comments for more info */
3444 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3445 unsigned char buf[2];
3446 uint32_t len;
3447 int type;
3448
3449 if (isencoded) *isencoded = 0;
3450 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3451 type = (buf[0]&0xC0)>>6;
3452 if (type == REDIS_RDB_6BITLEN) {
3453 /* Read a 6 bit len */
3454 return buf[0]&0x3F;
3455 } else if (type == REDIS_RDB_ENCVAL) {
3456 /* Read a 6 bit len encoding type */
3457 if (isencoded) *isencoded = 1;
3458 return buf[0]&0x3F;
3459 } else if (type == REDIS_RDB_14BITLEN) {
3460 /* Read a 14 bit len */
3461 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3462 return ((buf[0]&0x3F)<<8)|buf[1];
3463 } else {
3464 /* Read a 32 bit len */
3465 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3466 return ntohl(len);
3467 }
3468 }
3469
3470 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3471 unsigned char enc[4];
3472 long long val;
3473
3474 if (enctype == REDIS_RDB_ENC_INT8) {
3475 if (fread(enc,1,1,fp) == 0) return NULL;
3476 val = (signed char)enc[0];
3477 } else if (enctype == REDIS_RDB_ENC_INT16) {
3478 uint16_t v;
3479 if (fread(enc,2,1,fp) == 0) return NULL;
3480 v = enc[0]|(enc[1]<<8);
3481 val = (int16_t)v;
3482 } else if (enctype == REDIS_RDB_ENC_INT32) {
3483 uint32_t v;
3484 if (fread(enc,4,1,fp) == 0) return NULL;
3485 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3486 val = (int32_t)v;
3487 } else {
3488 val = 0; /* anti-warning */
3489 redisAssert(0);
3490 }
3491 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3492 }
3493
3494 static robj *rdbLoadLzfStringObject(FILE*fp) {
3495 unsigned int len, clen;
3496 unsigned char *c = NULL;
3497 sds val = NULL;
3498
3499 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3500 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3501 if ((c = zmalloc(clen)) == NULL) goto err;
3502 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3503 if (fread(c,clen,1,fp) == 0) goto err;
3504 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3505 zfree(c);
3506 return createObject(REDIS_STRING,val);
3507 err:
3508 zfree(c);
3509 sdsfree(val);
3510 return NULL;
3511 }
3512
3513 static robj *rdbLoadStringObject(FILE*fp) {
3514 int isencoded;
3515 uint32_t len;
3516 sds val;
3517
3518 len = rdbLoadLen(fp,&isencoded);
3519 if (isencoded) {
3520 switch(len) {
3521 case REDIS_RDB_ENC_INT8:
3522 case REDIS_RDB_ENC_INT16:
3523 case REDIS_RDB_ENC_INT32:
3524 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3525 case REDIS_RDB_ENC_LZF:
3526 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3527 default:
3528 redisAssert(0);
3529 }
3530 }
3531
3532 if (len == REDIS_RDB_LENERR) return NULL;
3533 val = sdsnewlen(NULL,len);
3534 if (len && fread(val,len,1,fp) == 0) {
3535 sdsfree(val);
3536 return NULL;
3537 }
3538 return tryObjectSharing(createObject(REDIS_STRING,val));
3539 }
3540
3541 /* For information about double serialization check rdbSaveDoubleValue() */
3542 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3543 char buf[128];
3544 unsigned char len;
3545
3546 if (fread(&len,1,1,fp) == 0) return -1;
3547 switch(len) {
3548 case 255: *val = R_NegInf; return 0;
3549 case 254: *val = R_PosInf; return 0;
3550 case 253: *val = R_Nan; return 0;
3551 default:
3552 if (fread(buf,len,1,fp) == 0) return -1;
3553 buf[len] = '\0';
3554 sscanf(buf, "%lg", val);
3555 return 0;
3556 }
3557 }
3558
3559 /* Load a Redis object of the specified type from the specified file.
3560 * On success a newly allocated object is returned, otherwise NULL. */
3561 static robj *rdbLoadObject(int type, FILE *fp) {
3562 robj *o;
3563
3564 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3565 if (type == REDIS_STRING) {
3566 /* Read string value */
3567 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3568 tryObjectEncoding(o);
3569 } else if (type == REDIS_LIST || type == REDIS_SET) {
3570 /* Read list/set value */
3571 uint32_t listlen;
3572
3573 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3574 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3575 /* It's faster to expand the dict to the right size asap in order
3576 * to avoid rehashing */
3577 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3578 dictExpand(o->ptr,listlen);
3579 /* Load every single element of the list/set */
3580 while(listlen--) {
3581 robj *ele;
3582
3583 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3584 tryObjectEncoding(ele);
3585 if (type == REDIS_LIST) {
3586 listAddNodeTail((list*)o->ptr,ele);
3587 } else {
3588 dictAdd((dict*)o->ptr,ele,NULL);
3589 }
3590 }
3591 } else if (type == REDIS_ZSET) {
3592 /* Read list/set value */
3593 size_t zsetlen;
3594 zset *zs;
3595
3596 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3597 o = createZsetObject();
3598 zs = o->ptr;
3599 /* Load every single element of the list/set */
3600 while(zsetlen--) {
3601 robj *ele;
3602 double *score = zmalloc(sizeof(double));
3603
3604 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3605 tryObjectEncoding(ele);
3606 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3607 dictAdd(zs->dict,ele,score);
3608 zslInsert(zs->zsl,*score,ele);
3609 incrRefCount(ele); /* added to skiplist */
3610 }
3611 } else if (type == REDIS_HASH) {
3612 size_t hashlen;
3613
3614 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3615 o = createHashObject();
3616 /* Too many entries? Use an hash table. */
3617 if (hashlen > server.hash_max_zipmap_entries)
3618 convertToRealHash(o);
3619 /* Load every key/value, then set it into the zipmap or hash
3620 * table, as needed. */
3621 while(hashlen--) {
3622 robj *key, *val;
3623
3624 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3625 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3626 /* If we are using a zipmap and there are too big values
3627 * the object is converted to real hash table encoding. */
3628 if (o->encoding != REDIS_ENCODING_HT &&
3629 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3630 sdslen(val->ptr) > server.hash_max_zipmap_value))
3631 {
3632 convertToRealHash(o);
3633 }
3634
3635 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3636 unsigned char *zm = o->ptr;
3637
3638 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3639 val->ptr,sdslen(val->ptr),NULL);
3640 o->ptr = zm;
3641 decrRefCount(key);
3642 decrRefCount(val);
3643 } else {
3644 tryObjectEncoding(key);
3645 tryObjectEncoding(val);
3646 dictAdd((dict*)o->ptr,key,val);
3647 }
3648 }
3649 } else {
3650 redisAssert(0);
3651 }
3652 return o;
3653 }
3654
3655 static int rdbLoad(char *filename) {
3656 FILE *fp;
3657 robj *keyobj = NULL;
3658 uint32_t dbid;
3659 int type, retval, rdbver;
3660 dict *d = server.db[0].dict;
3661 redisDb *db = server.db+0;
3662 char buf[1024];
3663 time_t expiretime = -1, now = time(NULL);
3664 long long loadedkeys = 0;
3665
3666 fp = fopen(filename,"r");
3667 if (!fp) return REDIS_ERR;
3668 if (fread(buf,9,1,fp) == 0) goto eoferr;
3669 buf[9] = '\0';
3670 if (memcmp(buf,"REDIS",5) != 0) {
3671 fclose(fp);
3672 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3673 return REDIS_ERR;
3674 }
3675 rdbver = atoi(buf+5);
3676 if (rdbver != 1) {
3677 fclose(fp);
3678 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3679 return REDIS_ERR;
3680 }
3681 while(1) {
3682 robj *o;
3683
3684 /* Read type. */
3685 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3686 if (type == REDIS_EXPIRETIME) {
3687 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3688 /* We read the time so we need to read the object type again */
3689 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3690 }
3691 if (type == REDIS_EOF) break;
3692 /* Handle SELECT DB opcode as a special case */
3693 if (type == REDIS_SELECTDB) {
3694 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3695 goto eoferr;
3696 if (dbid >= (unsigned)server.dbnum) {
3697 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3698 exit(1);
3699 }
3700 db = server.db+dbid;
3701 d = db->dict;
3702 continue;
3703 }
3704 /* Read key */
3705 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3706 /* Read value */
3707 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3708 /* Add the new object in the hash table */
3709 retval = dictAdd(d,keyobj,o);
3710 if (retval == DICT_ERR) {
3711 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3712 exit(1);
3713 }
3714 /* Set the expire time if needed */
3715 if (expiretime != -1) {
3716 setExpire(db,keyobj,expiretime);
3717 /* Delete this key if already expired */
3718 if (expiretime < now) deleteKey(db,keyobj);
3719 expiretime = -1;
3720 }
3721 keyobj = o = NULL;
3722 /* Handle swapping while loading big datasets when VM is on */
3723 loadedkeys++;
3724 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3725 while (zmalloc_used_memory() > server.vm_max_memory) {
3726 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3727 }
3728 }
3729 }
3730 fclose(fp);
3731 return REDIS_OK;
3732
3733 eoferr: /* unexpected end of file is handled here with a fatal exit */
3734 if (keyobj) decrRefCount(keyobj);
3735 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3736 exit(1);
3737 return REDIS_ERR; /* Just to avoid warning */
3738 }
3739
3740 /*================================== Commands =============================== */
3741
3742 static void authCommand(redisClient *c) {
3743 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3744 c->authenticated = 1;
3745 addReply(c,shared.ok);
3746 } else {
3747 c->authenticated = 0;
3748 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3749 }
3750 }
3751
3752 static void pingCommand(redisClient *c) {
3753 addReply(c,shared.pong);
3754 }
3755
3756 static void echoCommand(redisClient *c) {
3757 addReplyBulk(c,c->argv[1]);
3758 }
3759
3760 /*=================================== Strings =============================== */
3761
3762 static void setGenericCommand(redisClient *c, int nx) {
3763 int retval;
3764
3765 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3766 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3767 if (retval == DICT_ERR) {
3768 if (!nx) {
3769 /* If the key is about a swapped value, we want a new key object
3770 * to overwrite the old. So we delete the old key in the database.
3771 * This will also make sure that swap pages about the old object
3772 * will be marked as free. */
3773 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3774 incrRefCount(c->argv[1]);
3775 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3776 incrRefCount(c->argv[2]);
3777 } else {
3778 addReply(c,shared.czero);
3779 return;
3780 }
3781 } else {
3782 incrRefCount(c->argv[1]);
3783 incrRefCount(c->argv[2]);
3784 }
3785 server.dirty++;
3786 removeExpire(c->db,c->argv[1]);
3787 addReply(c, nx ? shared.cone : shared.ok);
3788 }
3789
3790 static void setCommand(redisClient *c) {
3791 setGenericCommand(c,0);
3792 }
3793
3794 static void setnxCommand(redisClient *c) {
3795 setGenericCommand(c,1);
3796 }
3797
3798 static int getGenericCommand(redisClient *c) {
3799 robj *o;
3800
3801 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3802 return REDIS_OK;
3803
3804 if (o->type != REDIS_STRING) {
3805 addReply(c,shared.wrongtypeerr);
3806 return REDIS_ERR;
3807 } else {
3808 addReplyBulk(c,o);
3809 return REDIS_OK;
3810 }
3811 }
3812
3813 static void getCommand(redisClient *c) {
3814 getGenericCommand(c);
3815 }
3816
3817 static void getsetCommand(redisClient *c) {
3818 if (getGenericCommand(c) == REDIS_ERR) return;
3819 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3820 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3821 } else {
3822 incrRefCount(c->argv[1]);
3823 }
3824 incrRefCount(c->argv[2]);
3825 server.dirty++;
3826 removeExpire(c->db,c->argv[1]);
3827 }
3828
3829 static void mgetCommand(redisClient *c) {
3830 int j;
3831
3832 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3833 for (j = 1; j < c->argc; j++) {
3834 robj *o = lookupKeyRead(c->db,c->argv[j]);
3835 if (o == NULL) {
3836 addReply(c,shared.nullbulk);
3837 } else {
3838 if (o->type != REDIS_STRING) {
3839 addReply(c,shared.nullbulk);
3840 } else {
3841 addReplyBulk(c,o);
3842 }
3843 }
3844 }
3845 }
3846
3847 static void msetGenericCommand(redisClient *c, int nx) {
3848 int j, busykeys = 0;
3849
3850 if ((c->argc % 2) == 0) {
3851 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3852 return;
3853 }
3854 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3855 * set nothing at all if at least one already key exists. */
3856 if (nx) {
3857 for (j = 1; j < c->argc; j += 2) {
3858 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3859 busykeys++;
3860 }
3861 }
3862 }
3863 if (busykeys) {
3864 addReply(c, shared.czero);
3865 return;
3866 }
3867
3868 for (j = 1; j < c->argc; j += 2) {
3869 int retval;
3870
3871 tryObjectEncoding(c->argv[j+1]);
3872 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3873 if (retval == DICT_ERR) {
3874 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3875 incrRefCount(c->argv[j+1]);
3876 } else {
3877 incrRefCount(c->argv[j]);
3878 incrRefCount(c->argv[j+1]);
3879 }
3880 removeExpire(c->db,c->argv[j]);
3881 }
3882 server.dirty += (c->argc-1)/2;
3883 addReply(c, nx ? shared.cone : shared.ok);
3884 }
3885
3886 static void msetCommand(redisClient *c) {
3887 msetGenericCommand(c,0);
3888 }
3889
3890 static void msetnxCommand(redisClient *c) {
3891 msetGenericCommand(c,1);
3892 }
3893
3894 static void incrDecrCommand(redisClient *c, long long incr) {
3895 long long value;
3896 int retval;
3897 robj *o;
3898
3899 o = lookupKeyWrite(c->db,c->argv[1]);
3900 if (o == NULL) {
3901 value = 0;
3902 } else {
3903 if (o->type != REDIS_STRING) {
3904 value = 0;
3905 } else {
3906 char *eptr;
3907
3908 if (o->encoding == REDIS_ENCODING_RAW)
3909 value = strtoll(o->ptr, &eptr, 10);
3910 else if (o->encoding == REDIS_ENCODING_INT)
3911 value = (long)o->ptr;
3912 else
3913 redisAssert(1 != 1);
3914 }
3915 }
3916
3917 value += incr;
3918 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3919 tryObjectEncoding(o);
3920 retval = dictAdd(c->db->dict,c->argv[1],o);
3921 if (retval == DICT_ERR) {
3922 dictReplace(c->db->dict,c->argv[1],o);
3923 removeExpire(c->db,c->argv[1]);
3924 } else {
3925 incrRefCount(c->argv[1]);
3926 }
3927 server.dirty++;
3928 addReply(c,shared.colon);
3929 addReply(c,o);
3930 addReply(c,shared.crlf);
3931 }
3932
3933 static void incrCommand(redisClient *c) {
3934 incrDecrCommand(c,1);
3935 }
3936
3937 static void decrCommand(redisClient *c) {
3938 incrDecrCommand(c,-1);
3939 }
3940
3941 static void incrbyCommand(redisClient *c) {
3942 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3943 incrDecrCommand(c,incr);
3944 }
3945
3946 static void decrbyCommand(redisClient *c) {
3947 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3948 incrDecrCommand(c,-incr);
3949 }
3950
3951 static void appendCommand(redisClient *c) {
3952 int retval;
3953 size_t totlen;
3954 robj *o;
3955
3956 o = lookupKeyWrite(c->db,c->argv[1]);
3957 if (o == NULL) {
3958 /* Create the key */
3959 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3960 incrRefCount(c->argv[1]);
3961 incrRefCount(c->argv[2]);
3962 totlen = stringObjectLen(c->argv[2]);
3963 } else {
3964 dictEntry *de;
3965
3966 de = dictFind(c->db->dict,c->argv[1]);
3967 assert(de != NULL);
3968
3969 o = dictGetEntryVal(de);
3970 if (o->type != REDIS_STRING) {
3971 addReply(c,shared.wrongtypeerr);
3972 return;
3973 }
3974 /* If the object is specially encoded or shared we have to make
3975 * a copy */
3976 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3977 robj *decoded = getDecodedObject(o);
3978
3979 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3980 decrRefCount(decoded);
3981 dictReplace(c->db->dict,c->argv[1],o);
3982 }
3983 /* APPEND! */
3984 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3985 o->ptr = sdscatlen(o->ptr,
3986 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3987 } else {
3988 o->ptr = sdscatprintf(o->ptr, "%ld",
3989 (unsigned long) c->argv[2]->ptr);
3990 }
3991 totlen = sdslen(o->ptr);
3992 }
3993 server.dirty++;
3994 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3995 }
3996
3997 static void substrCommand(redisClient *c) {
3998 robj *o;
3999 long start = atoi(c->argv[2]->ptr);
4000 long end = atoi(c->argv[3]->ptr);
4001 size_t rangelen, strlen;
4002 sds range;
4003
4004 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4005 checkType(c,o,REDIS_STRING)) return;
4006
4007 o = getDecodedObject(o);
4008 strlen = sdslen(o->ptr);
4009
4010 /* convert negative indexes */
4011 if (start < 0) start = strlen+start;
4012 if (end < 0) end = strlen+end;
4013 if (start < 0) start = 0;
4014 if (end < 0) end = 0;
4015
4016 /* indexes sanity checks */
4017 if (start > end || (size_t)start >= strlen) {
4018 /* Out of range start or start > end result in null reply */
4019 addReply(c,shared.nullbulk);
4020 decrRefCount(o);
4021 return;
4022 }
4023 if ((size_t)end >= strlen) end = strlen-1;
4024 rangelen = (end-start)+1;
4025
4026 /* Return the result */
4027 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4028 range = sdsnewlen((char*)o->ptr+start,rangelen);
4029 addReplySds(c,range);
4030 addReply(c,shared.crlf);
4031 decrRefCount(o);
4032 }
4033
4034 /* ========================= Type agnostic commands ========================= */
4035
4036 static void delCommand(redisClient *c) {
4037 int deleted = 0, j;
4038
4039 for (j = 1; j < c->argc; j++) {
4040 if (deleteKey(c->db,c->argv[j])) {
4041 server.dirty++;
4042 deleted++;
4043 }
4044 }
4045 addReplyLong(c,deleted);
4046 }
4047
4048 static void existsCommand(redisClient *c) {
4049 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4050 }
4051
4052 static void selectCommand(redisClient *c) {
4053 int id = atoi(c->argv[1]->ptr);
4054
4055 if (selectDb(c,id) == REDIS_ERR) {
4056 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4057 } else {
4058 addReply(c,shared.ok);
4059 }
4060 }
4061
4062 static void randomkeyCommand(redisClient *c) {
4063 dictEntry *de;
4064
4065 while(1) {
4066 de = dictGetRandomKey(c->db->dict);
4067 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4068 }
4069 if (de == NULL) {
4070 addReply(c,shared.plus);
4071 addReply(c,shared.crlf);
4072 } else {
4073 addReply(c,shared.plus);
4074 addReply(c,dictGetEntryKey(de));
4075 addReply(c,shared.crlf);
4076 }
4077 }
4078
4079 static void keysCommand(redisClient *c) {
4080 dictIterator *di;
4081 dictEntry *de;
4082 sds pattern = c->argv[1]->ptr;
4083 int plen = sdslen(pattern);
4084 unsigned long numkeys = 0;
4085 robj *lenobj = createObject(REDIS_STRING,NULL);
4086
4087 di = dictGetIterator(c->db->dict);
4088 addReply(c,lenobj);
4089 decrRefCount(lenobj);
4090 while((de = dictNext(di)) != NULL) {
4091 robj *keyobj = dictGetEntryKey(de);
4092
4093 sds key = keyobj->ptr;
4094 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4095 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4096 if (expireIfNeeded(c->db,keyobj) == 0) {
4097 addReplyBulk(c,keyobj);
4098 numkeys++;
4099 }
4100 }
4101 }
4102 dictReleaseIterator(di);
4103 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4104 }
4105
4106 static void dbsizeCommand(redisClient *c) {
4107 addReplySds(c,
4108 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4109 }
4110
4111 static void lastsaveCommand(redisClient *c) {
4112 addReplySds(c,
4113 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4114 }
4115
4116 static void typeCommand(redisClient *c) {
4117 robj *o;
4118 char *type;
4119
4120 o = lookupKeyRead(c->db,c->argv[1]);
4121 if (o == NULL) {
4122 type = "+none";
4123 } else {
4124 switch(o->type) {
4125 case REDIS_STRING: type = "+string"; break;
4126 case REDIS_LIST: type = "+list"; break;
4127 case REDIS_SET: type = "+set"; break;
4128 case REDIS_ZSET: type = "+zset"; break;
4129 case REDIS_HASH: type = "+hash"; break;
4130 default: type = "+unknown"; break;
4131 }
4132 }
4133 addReplySds(c,sdsnew(type));
4134 addReply(c,shared.crlf);
4135 }
4136
4137 static void saveCommand(redisClient *c) {
4138 if (server.bgsavechildpid != -1) {
4139 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4140 return;
4141 }
4142 if (rdbSave(server.dbfilename) == REDIS_OK) {
4143 addReply(c,shared.ok);
4144 } else {
4145 addReply(c,shared.err);
4146 }
4147 }
4148
4149 static void bgsaveCommand(redisClient *c) {
4150 if (server.bgsavechildpid != -1) {
4151 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4152 return;
4153 }
4154 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4155 char *status = "+Background saving started\r\n";
4156 addReplySds(c,sdsnew(status));
4157 } else {
4158 addReply(c,shared.err);
4159 }
4160 }
4161
4162 static void shutdownCommand(redisClient *c) {
4163 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4164 /* Kill the saving child if there is a background saving in progress.
4165 We want to avoid race conditions, for instance our saving child may
4166 overwrite the synchronous saving did by SHUTDOWN. */
4167 if (server.bgsavechildpid != -1) {
4168 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4169 kill(server.bgsavechildpid,SIGKILL);
4170 rdbRemoveTempFile(server.bgsavechildpid);
4171 }
4172 if (server.appendonly) {
4173 /* Append only file: fsync() the AOF and exit */
4174 fsync(server.appendfd);
4175 if (server.vm_enabled) unlink(server.vm_swap_file);
4176 exit(0);
4177 } else {
4178 /* Snapshotting. Perform a SYNC SAVE and exit */
4179 if (rdbSave(server.dbfilename) == REDIS_OK) {
4180 if (server.daemonize)
4181 unlink(server.pidfile);
4182 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4183 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4184 if (server.vm_enabled) unlink(server.vm_swap_file);
4185 exit(0);
4186 } else {
4187 /* Ooops.. error saving! The best we can do is to continue
4188 * operating. Note that if there was a background saving process,
4189 * in the next cron() Redis will be notified that the background
4190 * saving aborted, handling special stuff like slaves pending for
4191 * synchronization... */
4192 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4193 addReplySds(c,
4194 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4195 }
4196 }
4197 }
4198
4199 static void renameGenericCommand(redisClient *c, int nx) {
4200 robj *o;
4201
4202 /* To use the same key as src and dst is probably an error */
4203 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4204 addReply(c,shared.sameobjecterr);
4205 return;
4206 }
4207
4208 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4209 return;
4210
4211 incrRefCount(o);
4212 deleteIfVolatile(c->db,c->argv[2]);
4213 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4214 if (nx) {
4215 decrRefCount(o);
4216 addReply(c,shared.czero);
4217 return;
4218 }
4219 dictReplace(c->db->dict,c->argv[2],o);
4220 } else {
4221 incrRefCount(c->argv[2]);
4222 }
4223 deleteKey(c->db,c->argv[1]);
4224 server.dirty++;
4225 addReply(c,nx ? shared.cone : shared.ok);
4226 }
4227
4228 static void renameCommand(redisClient *c) {
4229 renameGenericCommand(c,0);
4230 }
4231
4232 static void renamenxCommand(redisClient *c) {
4233 renameGenericCommand(c,1);
4234 }
4235
4236 static void moveCommand(redisClient *c) {
4237 robj *o;
4238 redisDb *src, *dst;
4239 int srcid;
4240
4241 /* Obtain source and target DB pointers */
4242 src = c->db;
4243 srcid = c->db->id;
4244 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4245 addReply(c,shared.outofrangeerr);
4246 return;
4247 }
4248 dst = c->db;
4249 selectDb(c,srcid); /* Back to the source DB */
4250
4251 /* If the user is moving using as target the same
4252 * DB as the source DB it is probably an error. */
4253 if (src == dst) {
4254 addReply(c,shared.sameobjecterr);
4255 return;
4256 }
4257
4258 /* Check if the element exists and get a reference */
4259 o = lookupKeyWrite(c->db,c->argv[1]);
4260 if (!o) {
4261 addReply(c,shared.czero);
4262 return;
4263 }
4264
4265 /* Try to add the element to the target DB */
4266 deleteIfVolatile(dst,c->argv[1]);
4267 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4268 addReply(c,shared.czero);
4269 return;
4270 }
4271 incrRefCount(c->argv[1]);
4272 incrRefCount(o);
4273
4274 /* OK! key moved, free the entry in the source DB */
4275 deleteKey(src,c->argv[1]);
4276 server.dirty++;
4277 addReply(c,shared.cone);
4278 }
4279
4280 /* =================================== Lists ================================ */
4281 static void pushGenericCommand(redisClient *c, int where) {
4282 robj *lobj;
4283 list *list;
4284
4285 lobj = lookupKeyWrite(c->db,c->argv[1]);
4286 if (lobj == NULL) {
4287 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4288 addReply(c,shared.cone);
4289 return;
4290 }
4291 lobj = createListObject();
4292 list = lobj->ptr;
4293 if (where == REDIS_HEAD) {
4294 listAddNodeHead(list,c->argv[2]);
4295 } else {
4296 listAddNodeTail(list,c->argv[2]);
4297 }
4298 dictAdd(c->db->dict,c->argv[1],lobj);
4299 incrRefCount(c->argv[1]);
4300 incrRefCount(c->argv[2]);
4301 } else {
4302 if (lobj->type != REDIS_LIST) {
4303 addReply(c,shared.wrongtypeerr);
4304 return;
4305 }
4306 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4307 addReply(c,shared.cone);
4308 return;
4309 }
4310 list = lobj->ptr;
4311 if (where == REDIS_HEAD) {
4312 listAddNodeHead(list,c->argv[2]);
4313 } else {
4314 listAddNodeTail(list,c->argv[2]);
4315 }
4316 incrRefCount(c->argv[2]);
4317 }
4318 server.dirty++;
4319 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4320 }
4321
4322 static void lpushCommand(redisClient *c) {
4323 pushGenericCommand(c,REDIS_HEAD);
4324 }
4325
4326 static void rpushCommand(redisClient *c) {
4327 pushGenericCommand(c,REDIS_TAIL);
4328 }
4329
4330 static void llenCommand(redisClient *c) {
4331 robj *o;
4332 list *l;
4333
4334 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4335 checkType(c,o,REDIS_LIST)) return;
4336
4337 l = o->ptr;
4338 addReplyUlong(c,listLength(l));
4339 }
4340
4341 static void lindexCommand(redisClient *c) {
4342 robj *o;
4343 int index = atoi(c->argv[2]->ptr);
4344 list *list;
4345 listNode *ln;
4346
4347 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4348 checkType(c,o,REDIS_LIST)) return;
4349 list = o->ptr;
4350
4351 ln = listIndex(list, index);
4352 if (ln == NULL) {
4353 addReply(c,shared.nullbulk);
4354 } else {
4355 robj *ele = listNodeValue(ln);
4356 addReplyBulk(c,ele);
4357 }
4358 }
4359
4360 static void lsetCommand(redisClient *c) {
4361 robj *o;
4362 int index = atoi(c->argv[2]->ptr);
4363 list *list;
4364 listNode *ln;
4365
4366 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4367 checkType(c,o,REDIS_LIST)) return;
4368 list = o->ptr;
4369
4370 ln = listIndex(list, index);
4371 if (ln == NULL) {
4372 addReply(c,shared.outofrangeerr);
4373 } else {
4374 robj *ele = listNodeValue(ln);
4375
4376 decrRefCount(ele);
4377 listNodeValue(ln) = c->argv[3];
4378 incrRefCount(c->argv[3]);
4379 addReply(c,shared.ok);
4380 server.dirty++;
4381 }
4382 }
4383
4384 static void popGenericCommand(redisClient *c, int where) {
4385 robj *o;
4386 list *list;
4387 listNode *ln;
4388
4389 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4390 checkType(c,o,REDIS_LIST)) return;
4391 list = o->ptr;
4392
4393 if (where == REDIS_HEAD)
4394 ln = listFirst(list);
4395 else
4396 ln = listLast(list);
4397
4398 if (ln == NULL) {
4399 addReply(c,shared.nullbulk);
4400 } else {
4401 robj *ele = listNodeValue(ln);
4402 addReplyBulk(c,ele);
4403 listDelNode(list,ln);
4404 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4405 server.dirty++;
4406 }
4407 }
4408
4409 static void lpopCommand(redisClient *c) {
4410 popGenericCommand(c,REDIS_HEAD);
4411 }
4412
4413 static void rpopCommand(redisClient *c) {
4414 popGenericCommand(c,REDIS_TAIL);
4415 }
4416
4417 static void lrangeCommand(redisClient *c) {
4418 robj *o;
4419 int start = atoi(c->argv[2]->ptr);
4420 int end = atoi(c->argv[3]->ptr);
4421 int llen;
4422 int rangelen, j;
4423 list *list;
4424 listNode *ln;
4425 robj *ele;
4426
4427 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4428 checkType(c,o,REDIS_LIST)) return;
4429 list = o->ptr;
4430 llen = listLength(list);
4431
4432 /* convert negative indexes */
4433 if (start < 0) start = llen+start;
4434 if (end < 0) end = llen+end;
4435 if (start < 0) start = 0;
4436 if (end < 0) end = 0;
4437
4438 /* indexes sanity checks */
4439 if (start > end || start >= llen) {
4440 /* Out of range start or start > end result in empty list */
4441 addReply(c,shared.emptymultibulk);
4442 return;
4443 }
4444 if (end >= llen) end = llen-1;
4445 rangelen = (end-start)+1;
4446
4447 /* Return the result in form of a multi-bulk reply */
4448 ln = listIndex(list, start);
4449 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4450 for (j = 0; j < rangelen; j++) {
4451 ele = listNodeValue(ln);
4452 addReplyBulk(c,ele);
4453 ln = ln->next;
4454 }
4455 }
4456
4457 static void ltrimCommand(redisClient *c) {
4458 robj *o;
4459 int start = atoi(c->argv[2]->ptr);
4460 int end = atoi(c->argv[3]->ptr);
4461 int llen;
4462 int j, ltrim, rtrim;
4463 list *list;
4464 listNode *ln;
4465
4466 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4467 checkType(c,o,REDIS_LIST)) return;
4468 list = o->ptr;
4469 llen = listLength(list);
4470
4471 /* convert negative indexes */
4472 if (start < 0) start = llen+start;
4473 if (end < 0) end = llen+end;
4474 if (start < 0) start = 0;
4475 if (end < 0) end = 0;
4476
4477 /* indexes sanity checks */
4478 if (start > end || start >= llen) {
4479 /* Out of range start or start > end result in empty list */
4480 ltrim = llen;
4481 rtrim = 0;
4482 } else {
4483 if (end >= llen) end = llen-1;
4484 ltrim = start;
4485 rtrim = llen-end-1;
4486 }
4487
4488 /* Remove list elements to perform the trim */
4489 for (j = 0; j < ltrim; j++) {
4490 ln = listFirst(list);
4491 listDelNode(list,ln);
4492 }
4493 for (j = 0; j < rtrim; j++) {
4494 ln = listLast(list);
4495 listDelNode(list,ln);
4496 }
4497 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4498 server.dirty++;
4499 addReply(c,shared.ok);
4500 }
4501
4502 static void lremCommand(redisClient *c) {
4503 robj *o;
4504 list *list;
4505 listNode *ln, *next;
4506 int toremove = atoi(c->argv[2]->ptr);
4507 int removed = 0;
4508 int fromtail = 0;
4509
4510 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4511 checkType(c,o,REDIS_LIST)) return;
4512 list = o->ptr;
4513
4514 if (toremove < 0) {
4515 toremove = -toremove;
4516 fromtail = 1;
4517 }
4518 ln = fromtail ? list->tail : list->head;
4519 while (ln) {
4520 robj *ele = listNodeValue(ln);
4521
4522 next = fromtail ? ln->prev : ln->next;
4523 if (compareStringObjects(ele,c->argv[3]) == 0) {
4524 listDelNode(list,ln);
4525 server.dirty++;
4526 removed++;
4527 if (toremove && removed == toremove) break;
4528 }
4529 ln = next;
4530 }
4531 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4532 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4533 }
4534
4535 /* This is the semantic of this command:
4536 * RPOPLPUSH srclist dstlist:
4537 * IF LLEN(srclist) > 0
4538 * element = RPOP srclist
4539 * LPUSH dstlist element
4540 * RETURN element
4541 * ELSE
4542 * RETURN nil
4543 * END
4544 * END
4545 *
4546 * The idea is to be able to get an element from a list in a reliable way
4547 * since the element is not just returned but pushed against another list
4548 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4549 */
4550 static void rpoplpushcommand(redisClient *c) {
4551 robj *sobj;
4552 list *srclist;
4553 listNode *ln;
4554
4555 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4556 checkType(c,sobj,REDIS_LIST)) return;
4557 srclist = sobj->ptr;
4558 ln = listLast(srclist);
4559
4560 if (ln == NULL) {
4561 addReply(c,shared.nullbulk);
4562 } else {
4563 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4564 robj *ele = listNodeValue(ln);
4565 list *dstlist;
4566
4567 if (dobj && dobj->type != REDIS_LIST) {
4568 addReply(c,shared.wrongtypeerr);
4569 return;
4570 }
4571
4572 /* Add the element to the target list (unless it's directly
4573 * passed to some BLPOP-ing client */
4574 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4575 if (dobj == NULL) {
4576 /* Create the list if the key does not exist */
4577 dobj = createListObject();
4578 dictAdd(c->db->dict,c->argv[2],dobj);
4579 incrRefCount(c->argv[2]);
4580 }
4581 dstlist = dobj->ptr;
4582 listAddNodeHead(dstlist,ele);
4583 incrRefCount(ele);
4584 }
4585
4586 /* Send the element to the client as reply as well */
4587 addReplyBulk(c,ele);
4588
4589 /* Finally remove the element from the source list */
4590 listDelNode(srclist,ln);
4591 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4592 server.dirty++;
4593 }
4594 }
4595
4596 /* ==================================== Sets ================================ */
4597
4598 static void saddCommand(redisClient *c) {
4599 robj *set;
4600
4601 set = lookupKeyWrite(c->db,c->argv[1]);
4602 if (set == NULL) {
4603 set = createSetObject();
4604 dictAdd(c->db->dict,c->argv[1],set);
4605 incrRefCount(c->argv[1]);
4606 } else {
4607 if (set->type != REDIS_SET) {
4608 addReply(c,shared.wrongtypeerr);
4609 return;
4610 }
4611 }
4612 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4613 incrRefCount(c->argv[2]);
4614 server.dirty++;
4615 addReply(c,shared.cone);
4616 } else {
4617 addReply(c,shared.czero);
4618 }
4619 }
4620
4621 static void sremCommand(redisClient *c) {
4622 robj *set;
4623
4624 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4625 checkType(c,set,REDIS_SET)) return;
4626
4627 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4628 server.dirty++;
4629 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4630 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4631 addReply(c,shared.cone);
4632 } else {
4633 addReply(c,shared.czero);
4634 }
4635 }
4636
4637 static void smoveCommand(redisClient *c) {
4638 robj *srcset, *dstset;
4639
4640 srcset = lookupKeyWrite(c->db,c->argv[1]);
4641 dstset = lookupKeyWrite(c->db,c->argv[2]);
4642
4643 /* If the source key does not exist return 0, if it's of the wrong type
4644 * raise an error */
4645 if (srcset == NULL || srcset->type != REDIS_SET) {
4646 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4647 return;
4648 }
4649 /* Error if the destination key is not a set as well */
4650 if (dstset && dstset->type != REDIS_SET) {
4651 addReply(c,shared.wrongtypeerr);
4652 return;
4653 }
4654 /* Remove the element from the source set */
4655 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4656 /* Key not found in the src set! return zero */
4657 addReply(c,shared.czero);
4658 return;
4659 }
4660 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4661 deleteKey(c->db,c->argv[1]);
4662 server.dirty++;
4663 /* Add the element to the destination set */
4664 if (!dstset) {
4665 dstset = createSetObject();
4666 dictAdd(c->db->dict,c->argv[2],dstset);
4667 incrRefCount(c->argv[2]);
4668 }
4669 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4670 incrRefCount(c->argv[3]);
4671 addReply(c,shared.cone);
4672 }
4673
4674 static void sismemberCommand(redisClient *c) {
4675 robj *set;
4676
4677 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4678 checkType(c,set,REDIS_SET)) return;
4679
4680 if (dictFind(set->ptr,c->argv[2]))
4681 addReply(c,shared.cone);
4682 else
4683 addReply(c,shared.czero);
4684 }
4685
4686 static void scardCommand(redisClient *c) {
4687 robj *o;
4688 dict *s;
4689
4690 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4691 checkType(c,o,REDIS_SET)) return;
4692
4693 s = o->ptr;
4694 addReplyUlong(c,dictSize(s));
4695 }
4696
4697 static void spopCommand(redisClient *c) {
4698 robj *set;
4699 dictEntry *de;
4700
4701 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4702 checkType(c,set,REDIS_SET)) return;
4703
4704 de = dictGetRandomKey(set->ptr);
4705 if (de == NULL) {
4706 addReply(c,shared.nullbulk);
4707 } else {
4708 robj *ele = dictGetEntryKey(de);
4709
4710 addReplyBulk(c,ele);
4711 dictDelete(set->ptr,ele);
4712 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4713 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4714 server.dirty++;
4715 }
4716 }
4717
4718 static void srandmemberCommand(redisClient *c) {
4719 robj *set;
4720 dictEntry *de;
4721
4722 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4723 checkType(c,set,REDIS_SET)) return;
4724
4725 de = dictGetRandomKey(set->ptr);
4726 if (de == NULL) {
4727 addReply(c,shared.nullbulk);
4728 } else {
4729 robj *ele = dictGetEntryKey(de);
4730
4731 addReplyBulk(c,ele);
4732 }
4733 }
4734
4735 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4736 dict **d1 = (void*) s1, **d2 = (void*) s2;
4737
4738 return dictSize(*d1)-dictSize(*d2);
4739 }
4740
4741 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4742 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4743 dictIterator *di;
4744 dictEntry *de;
4745 robj *lenobj = NULL, *dstset = NULL;
4746 unsigned long j, cardinality = 0;
4747
4748 for (j = 0; j < setsnum; j++) {
4749 robj *setobj;
4750
4751 setobj = dstkey ?
4752 lookupKeyWrite(c->db,setskeys[j]) :
4753 lookupKeyRead(c->db,setskeys[j]);
4754 if (!setobj) {
4755 zfree(dv);
4756 if (dstkey) {
4757 if (deleteKey(c->db,dstkey))
4758 server.dirty++;
4759 addReply(c,shared.czero);
4760 } else {
4761 addReply(c,shared.nullmultibulk);
4762 }
4763 return;
4764 }
4765 if (setobj->type != REDIS_SET) {
4766 zfree(dv);
4767 addReply(c,shared.wrongtypeerr);
4768 return;
4769 }
4770 dv[j] = setobj->ptr;
4771 }
4772 /* Sort sets from the smallest to largest, this will improve our
4773 * algorithm's performace */
4774 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4775
4776 /* The first thing we should output is the total number of elements...
4777 * since this is a multi-bulk write, but at this stage we don't know
4778 * the intersection set size, so we use a trick, append an empty object
4779 * to the output list and save the pointer to later modify it with the
4780 * right length */
4781 if (!dstkey) {
4782 lenobj = createObject(REDIS_STRING,NULL);
4783 addReply(c,lenobj);
4784 decrRefCount(lenobj);
4785 } else {
4786 /* If we have a target key where to store the resulting set
4787 * create this key with an empty set inside */
4788 dstset = createSetObject();
4789 }
4790
4791 /* Iterate all the elements of the first (smallest) set, and test
4792 * the element against all the other sets, if at least one set does
4793 * not include the element it is discarded */
4794 di = dictGetIterator(dv[0]);
4795
4796 while((de = dictNext(di)) != NULL) {
4797 robj *ele;
4798
4799 for (j = 1; j < setsnum; j++)
4800 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4801 if (j != setsnum)
4802 continue; /* at least one set does not contain the member */
4803 ele = dictGetEntryKey(de);
4804 if (!dstkey) {
4805 addReplyBulk(c,ele);
4806 cardinality++;
4807 } else {
4808 dictAdd(dstset->ptr,ele,NULL);
4809 incrRefCount(ele);
4810 }
4811 }
4812 dictReleaseIterator(di);
4813
4814 if (dstkey) {
4815 /* Store the resulting set into the target, if the intersection
4816 * is not an empty set. */
4817 deleteKey(c->db,dstkey);
4818 if (dictSize((dict*)dstset->ptr) > 0) {
4819 dictAdd(c->db->dict,dstkey,dstset);
4820 incrRefCount(dstkey);
4821 addReplyLong(c,dictSize((dict*)dstset->ptr));
4822 } else {
4823 decrRefCount(dstset);
4824 addReply(c,shared.czero);
4825 }
4826 server.dirty++;
4827 } else {
4828 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4829 }
4830 zfree(dv);
4831 }
4832
4833 static void sinterCommand(redisClient *c) {
4834 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4835 }
4836
4837 static void sinterstoreCommand(redisClient *c) {
4838 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4839 }
4840
/* Operation selectors passed as the 'op' argument of the generic set
 * algebra commands: sunionDiffGenericCommand() is called with UNION or
 * DIFF, zunionInterGenericCommand() with UNION or INTER. */
4841 #define REDIS_OP_UNION 0
4842 #define REDIS_OP_DIFF 1
4843 #define REDIS_OP_INTER 2
4844
4845 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4846 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4847 dictIterator *di;
4848 dictEntry *de;
4849 robj *dstset = NULL;
4850 int j, cardinality = 0;
4851
4852 for (j = 0; j < setsnum; j++) {
4853 robj *setobj;
4854
4855 setobj = dstkey ?
4856 lookupKeyWrite(c->db,setskeys[j]) :
4857 lookupKeyRead(c->db,setskeys[j]);
4858 if (!setobj) {
4859 dv[j] = NULL;
4860 continue;
4861 }
4862 if (setobj->type != REDIS_SET) {
4863 zfree(dv);
4864 addReply(c,shared.wrongtypeerr);
4865 return;
4866 }
4867 dv[j] = setobj->ptr;
4868 }
4869
4870 /* We need a temp set object to store our union. If the dstkey
4871 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4872 * this set object will be the resulting object to set into the target key*/
4873 dstset = createSetObject();
4874
4875 /* Iterate all the elements of all the sets, add every element a single
4876 * time to the result set */
4877 for (j = 0; j < setsnum; j++) {
4878 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4879 if (!dv[j]) continue; /* non existing keys are like empty sets */
4880
4881 di = dictGetIterator(dv[j]);
4882
4883 while((de = dictNext(di)) != NULL) {
4884 robj *ele;
4885
4886 /* dictAdd will not add the same element multiple times */
4887 ele = dictGetEntryKey(de);
4888 if (op == REDIS_OP_UNION || j == 0) {
4889 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4890 incrRefCount(ele);
4891 cardinality++;
4892 }
4893 } else if (op == REDIS_OP_DIFF) {
4894 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4895 cardinality--;
4896 }
4897 }
4898 }
4899 dictReleaseIterator(di);
4900
4901 /* result set is empty? Exit asap. */
4902 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4903 }
4904
4905 /* Output the content of the resulting set, if not in STORE mode */
4906 if (!dstkey) {
4907 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4908 di = dictGetIterator(dstset->ptr);
4909 while((de = dictNext(di)) != NULL) {
4910 robj *ele;
4911
4912 ele = dictGetEntryKey(de);
4913 addReplyBulk(c,ele);
4914 }
4915 dictReleaseIterator(di);
4916 decrRefCount(dstset);
4917 } else {
4918 /* If we have a target key where to store the resulting set
4919 * create this key with the result set inside */
4920 deleteKey(c->db,dstkey);
4921 if (dictSize((dict*)dstset->ptr) > 0) {
4922 dictAdd(c->db->dict,dstkey,dstset);
4923 incrRefCount(dstkey);
4924 addReplyLong(c,dictSize((dict*)dstset->ptr));
4925 } else {
4926 decrRefCount(dstset);
4927 addReply(c,shared.czero);
4928 }
4929 server.dirty++;
4930 }
4931 zfree(dv);
4932 }
4933
4934 static void sunionCommand(redisClient *c) {
4935 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4936 }
4937
4938 static void sunionstoreCommand(redisClient *c) {
4939 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4940 }
4941
4942 static void sdiffCommand(redisClient *c) {
4943 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4944 }
4945
4946 static void sdiffstoreCommand(redisClient *c) {
4947 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4948 }
4949
4950 /* ==================================== ZSets =============================== */
4951
4952 /* ZSETs are ordered sets using two data structures to hold the same elements
4953 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4954 * data structure.
4955 *
4956 * The elements are added to a hash table mapping Redis objects to scores.
4957 * At the same time the elements are added to a skip list mapping scores
4958 * to Redis objects (so objects are sorted by scores in this "view"). */
4959
4960 /* This skiplist implementation is almost a C translation of the original
4961 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4962 * Alternative to Balanced Trees", modified in three ways:
4963 * a) this implementation allows for repeated values.
4964 * b) the comparison is not just by key (our 'score') but by satellite data.
4965 * c) there is a back pointer, so it's a doubly linked list with the back
4966 * pointers being only at "level 1". This allows traversing the list
4967 * from tail to head, useful for ZREVRANGE. */
4968
4969 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4970 zskiplistNode *zn = zmalloc(sizeof(*zn));
4971
4972 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4973 if (level > 0)
4974 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4975 zn->score = score;
4976 zn->obj = obj;
4977 return zn;
4978 }
4979
4980 static zskiplist *zslCreate(void) {
4981 int j;
4982 zskiplist *zsl;
4983
4984 zsl = zmalloc(sizeof(*zsl));
4985 zsl->level = 1;
4986 zsl->length = 0;
4987 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4988 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4989 zsl->header->forward[j] = NULL;
4990
4991 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4992 if (j < ZSKIPLIST_MAXLEVEL-1)
4993 zsl->header->span[j] = 0;
4994 }
4995 zsl->header->backward = NULL;
4996 zsl->tail = NULL;
4997 return zsl;
4998 }
4999
5000 static void zslFreeNode(zskiplistNode *node) {
5001 decrRefCount(node->obj);
5002 zfree(node->forward);
5003 zfree(node->span);
5004 zfree(node);
5005 }
5006
5007 static void zslFree(zskiplist *zsl) {
5008 zskiplistNode *node = zsl->header->forward[0], *next;
5009
5010 zfree(zsl->header->forward);
5011 zfree(zsl->header->span);
5012 zfree(zsl->header);
5013 while(node) {
5014 next = node->forward[0];
5015 zslFreeNode(node);
5016 node = next;
5017 }
5018 zfree(zsl);
5019 }
5020
5021 static int zslRandomLevel(void) {
5022 int level = 1;
5023 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5024 level += 1;
5025 return level;
5026 }
5027
5028 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5029 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5030 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5031 int i, level;
5032
5033 x = zsl->header;
5034 for (i = zsl->level-1; i >= 0; i--) {
5035 /* store rank that is crossed to reach the insert position */
5036 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5037
5038 while (x->forward[i] &&
5039 (x->forward[i]->score < score ||
5040 (x->forward[i]->score == score &&
5041 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5042 rank[i] += i > 0 ? x->span[i-1] : 1;
5043 x = x->forward[i];
5044 }
5045 update[i] = x;
5046 }
5047 /* we assume the key is not already inside, since we allow duplicated
5048 * scores, and the re-insertion of score and redis object should never
5049 * happpen since the caller of zslInsert() should test in the hash table
5050 * if the element is already inside or not. */
5051 level = zslRandomLevel();
5052 if (level > zsl->level) {
5053 for (i = zsl->level; i < level; i++) {
5054 rank[i] = 0;
5055 update[i] = zsl->header;
5056 update[i]->span[i-1] = zsl->length;
5057 }
5058 zsl->level = level;
5059 }
5060 x = zslCreateNode(level,score,obj);
5061 for (i = 0; i < level; i++) {
5062 x->forward[i] = update[i]->forward[i];
5063 update[i]->forward[i] = x;
5064
5065 /* update span covered by update[i] as x is inserted here */
5066 if (i > 0) {
5067 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5068 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5069 }
5070 }
5071
5072 /* increment span for untouched levels */
5073 for (i = level; i < zsl->level; i++) {
5074 update[i]->span[i-1]++;
5075 }
5076
5077 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5078 if (x->forward[0])
5079 x->forward[0]->backward = x;
5080 else
5081 zsl->tail = x;
5082 zsl->length++;
5083 }
5084
5085 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5086 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5087 int i;
5088 for (i = 0; i < zsl->level; i++) {
5089 if (update[i]->forward[i] == x) {
5090 if (i > 0) {
5091 update[i]->span[i-1] += x->span[i-1] - 1;
5092 }
5093 update[i]->forward[i] = x->forward[i];
5094 } else {
5095 /* invariant: i > 0, because update[0]->forward[0]
5096 * is always equal to x */
5097 update[i]->span[i-1] -= 1;
5098 }
5099 }
5100 if (x->forward[0]) {
5101 x->forward[0]->backward = x->backward;
5102 } else {
5103 zsl->tail = x->backward;
5104 }
5105 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5106 zsl->level--;
5107 zsl->length--;
5108 }
5109
5110 /* Delete an element with matching score/object from the skiplist. */
5111 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5112 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5113 int i;
5114
5115 x = zsl->header;
5116 for (i = zsl->level-1; i >= 0; i--) {
5117 while (x->forward[i] &&
5118 (x->forward[i]->score < score ||
5119 (x->forward[i]->score == score &&
5120 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5121 x = x->forward[i];
5122 update[i] = x;
5123 }
5124 /* We may have multiple elements with the same score, what we need
5125 * is to find the element with both the right score and object. */
5126 x = x->forward[0];
5127 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5128 zslDeleteNode(zsl, x, update);
5129 zslFreeNode(x);
5130 return 1;
5131 } else {
5132 return 0; /* not found */
5133 }
5134 return 0; /* not found */
5135 }
5136
5137 /* Delete all the elements with score between min and max from the skiplist.
5138 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5139 * Note that this function takes the reference to the hash table view of the
5140 * sorted set, in order to remove the elements from the hash table too. */
5141 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5142 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5143 unsigned long removed = 0;
5144 int i;
5145
5146 x = zsl->header;
5147 for (i = zsl->level-1; i >= 0; i--) {
5148 while (x->forward[i] && x->forward[i]->score < min)
5149 x = x->forward[i];
5150 update[i] = x;
5151 }
5152 /* We may have multiple elements with the same score, what we need
5153 * is to find the element with both the right score and object. */
5154 x = x->forward[0];
5155 while (x && x->score <= max) {
5156 zskiplistNode *next = x->forward[0];
5157 zslDeleteNode(zsl, x, update);
5158 dictDelete(dict,x->obj);
5159 zslFreeNode(x);
5160 removed++;
5161 x = next;
5162 }
5163 return removed; /* not found */
5164 }
5165
5166 /* Delete all the elements with rank between start and end from the skiplist.
5167 * Start and end are inclusive. Note that start and end need to be 1-based */
5168 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5169 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5170 unsigned long traversed = 0, removed = 0;
5171 int i;
5172
5173 x = zsl->header;
5174 for (i = zsl->level-1; i >= 0; i--) {
5175 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5176 traversed += i > 0 ? x->span[i-1] : 1;
5177 x = x->forward[i];
5178 }
5179 update[i] = x;
5180 }
5181
5182 traversed++;
5183 x = x->forward[0];
5184 while (x && traversed <= end) {
5185 zskiplistNode *next = x->forward[0];
5186 zslDeleteNode(zsl, x, update);
5187 dictDelete(dict,x->obj);
5188 zslFreeNode(x);
5189 removed++;
5190 traversed++;
5191 x = next;
5192 }
5193 return removed;
5194 }
5195
5196 /* Find the first node having a score equal or greater than the specified one.
5197 * Returns NULL if there is no match. */
5198 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5199 zskiplistNode *x;
5200 int i;
5201
5202 x = zsl->header;
5203 for (i = zsl->level-1; i >= 0; i--) {
5204 while (x->forward[i] && x->forward[i]->score < score)
5205 x = x->forward[i];
5206 }
5207 /* We may have multiple elements with the same score, what we need
5208 * is to find the element with both the right score and object. */
5209 return x->forward[0];
5210 }
5211
5212 /* Find the rank for an element by both score and key.
5213 * Returns 0 when the element cannot be found, rank otherwise.
5214 * Note that the rank is 1-based due to the span of zsl->header to the
5215 * first element. */
5216 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5217 zskiplistNode *x;
5218 unsigned long rank = 0;
5219 int i;
5220
5221 x = zsl->header;
5222 for (i = zsl->level-1; i >= 0; i--) {
5223 while (x->forward[i] &&
5224 (x->forward[i]->score < score ||
5225 (x->forward[i]->score == score &&
5226 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5227 rank += i > 0 ? x->span[i-1] : 1;
5228 x = x->forward[i];
5229 }
5230
5231 /* x might be equal to zsl->header, so test if obj is non-NULL */
5232 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5233 return rank;
5234 }
5235 }
5236 return 0;
5237 }
5238
5239 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5240 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5241 zskiplistNode *x;
5242 unsigned long traversed = 0;
5243 int i;
5244
5245 x = zsl->header;
5246 for (i = zsl->level-1; i >= 0; i--) {
5247 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5248 {
5249 traversed += i > 0 ? x->span[i-1] : 1;
5250 x = x->forward[i];
5251 }
5252 if (traversed == rank) {
5253 return x;
5254 }
5255 }
5256 return NULL;
5257 }
5258
5259 /* The actual Z-commands implementations */
5260
5261 /* This generic command implements both ZADD and ZINCRBY.
5262 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5263 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5264 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5265 robj *zsetobj;
5266 zset *zs;
5267 double *score;
5268
5269 zsetobj = lookupKeyWrite(c->db,key);
5270 if (zsetobj == NULL) {
5271 zsetobj = createZsetObject();
5272 dictAdd(c->db->dict,key,zsetobj);
5273 incrRefCount(key);
5274 } else {
5275 if (zsetobj->type != REDIS_ZSET) {
5276 addReply(c,shared.wrongtypeerr);
5277 return;
5278 }
5279 }
5280 zs = zsetobj->ptr;
5281
5282 /* Ok now since we implement both ZADD and ZINCRBY here the code
5283 * needs to handle the two different conditions. It's all about setting
5284 * '*score', that is, the new score to set, to the right value. */
5285 score = zmalloc(sizeof(double));
5286 if (doincrement) {
5287 dictEntry *de;
5288
5289 /* Read the old score. If the element was not present starts from 0 */
5290 de = dictFind(zs->dict,ele);
5291 if (de) {
5292 double *oldscore = dictGetEntryVal(de);
5293 *score = *oldscore + scoreval;
5294 } else {
5295 *score = scoreval;
5296 }
5297 } else {
5298 *score = scoreval;
5299 }
5300
5301 /* What follows is a simple remove and re-insert operation that is common
5302 * to both ZADD and ZINCRBY... */
5303 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5304 /* case 1: New element */
5305 incrRefCount(ele); /* added to hash */
5306 zslInsert(zs->zsl,*score,ele);
5307 incrRefCount(ele); /* added to skiplist */
5308 server.dirty++;
5309 if (doincrement)
5310 addReplyDouble(c,*score);
5311 else
5312 addReply(c,shared.cone);
5313 } else {
5314 dictEntry *de;
5315 double *oldscore;
5316
5317 /* case 2: Score update operation */
5318 de = dictFind(zs->dict,ele);
5319 redisAssert(de != NULL);
5320 oldscore = dictGetEntryVal(de);
5321 if (*score != *oldscore) {
5322 int deleted;
5323
5324 /* Remove and insert the element in the skip list with new score */
5325 deleted = zslDelete(zs->zsl,*oldscore,ele);
5326 redisAssert(deleted != 0);
5327 zslInsert(zs->zsl,*score,ele);
5328 incrRefCount(ele);
5329 /* Update the score in the hash table */
5330 dictReplace(zs->dict,ele,score);
5331 server.dirty++;
5332 } else {
5333 zfree(score);
5334 }
5335 if (doincrement)
5336 addReplyDouble(c,*score);
5337 else
5338 addReply(c,shared.czero);
5339 }
5340 }
5341
5342 static void zaddCommand(redisClient *c) {
5343 double scoreval;
5344
5345 scoreval = strtod(c->argv[2]->ptr,NULL);
5346 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5347 }
5348
5349 static void zincrbyCommand(redisClient *c) {
5350 double scoreval;
5351
5352 scoreval = strtod(c->argv[2]->ptr,NULL);
5353 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5354 }
5355
5356 static void zremCommand(redisClient *c) {
5357 robj *zsetobj;
5358 zset *zs;
5359 dictEntry *de;
5360 double *oldscore;
5361 int deleted;
5362
5363 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5364 checkType(c,zsetobj,REDIS_ZSET)) return;
5365
5366 zs = zsetobj->ptr;
5367 de = dictFind(zs->dict,c->argv[2]);
5368 if (de == NULL) {
5369 addReply(c,shared.czero);
5370 return;
5371 }
5372 /* Delete from the skiplist */
5373 oldscore = dictGetEntryVal(de);
5374 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5375 redisAssert(deleted != 0);
5376
5377 /* Delete from the hash table */
5378 dictDelete(zs->dict,c->argv[2]);
5379 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5380 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5381 server.dirty++;
5382 addReply(c,shared.cone);
5383 }
5384
5385 static void zremrangebyscoreCommand(redisClient *c) {
5386 double min = strtod(c->argv[2]->ptr,NULL);
5387 double max = strtod(c->argv[3]->ptr,NULL);
5388 long deleted;
5389 robj *zsetobj;
5390 zset *zs;
5391
5392 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5393 checkType(c,zsetobj,REDIS_ZSET)) return;
5394
5395 zs = zsetobj->ptr;
5396 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5397 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5398 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5399 server.dirty += deleted;
5400 addReplyLong(c,deleted);
5401 }
5402
5403 static void zremrangebyrankCommand(redisClient *c) {
5404 int start = atoi(c->argv[2]->ptr);
5405 int end = atoi(c->argv[3]->ptr);
5406 int llen;
5407 long deleted;
5408 robj *zsetobj;
5409 zset *zs;
5410
5411 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5412 checkType(c,zsetobj,REDIS_ZSET)) return;
5413 zs = zsetobj->ptr;
5414 llen = zs->zsl->length;
5415
5416 /* convert negative indexes */
5417 if (start < 0) start = llen+start;
5418 if (end < 0) end = llen+end;
5419 if (start < 0) start = 0;
5420 if (end < 0) end = 0;
5421
5422 /* indexes sanity checks */
5423 if (start > end || start >= llen) {
5424 addReply(c,shared.czero);
5425 return;
5426 }
5427 if (end >= llen) end = llen-1;
5428
5429 /* increment start and end because zsl*Rank functions
5430 * use 1-based rank */
5431 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5432 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5433 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5434 server.dirty += deleted;
5435 addReplyLong(c, deleted);
5436 }
5437
/* One input of a ZUNION/ZINTER operation, as filled in by
 * zunionInterGenericCommand(). */
5438 typedef struct {
/* element -> score hash table of the source sorted set, or NULL when the
 * input key does not exist */
5439 dict *dict;
/* multiplier applied to every score read from this input; set to 1.0
 * unless overridden by the WEIGHTS option */
5440 double weight;
5441 } zsetopsrc;
5442
5443 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5444 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5445 unsigned long size1, size2;
5446 size1 = d1->dict ? dictSize(d1->dict) : 0;
5447 size2 = d2->dict ? dictSize(d2->dict) : 0;
5448 return size1 - size2;
5449 }
5450
/* Aggregation modes accepted by zunionInterAggregate() to combine the
 * scores of an element found in more than one input: add them up, keep the
 * smallest, or keep the largest. */
5451 #define REDIS_AGGR_SUM 1
5452 #define REDIS_AGGR_MIN 2
5453 #define REDIS_AGGR_MAX 3
5454
5455 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5456 if (aggregate == REDIS_AGGR_SUM) {
5457 *target = *target + val;
5458 } else if (aggregate == REDIS_AGGR_MIN) {
5459 *target = val < *target ? val : *target;
5460 } else if (aggregate == REDIS_AGGR_MAX) {
5461 *target = val > *target ? val : *target;
5462 } else {
5463 /* safety net */
5464 redisAssert(0 != 0);
5465 }
5466 }
5467
5468 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5469 int i, j, zsetnum;
5470 int aggregate = REDIS_AGGR_SUM;
5471 zsetopsrc *src;
5472 robj *dstobj;
5473 zset *dstzset;
5474 dictIterator *di;
5475 dictEntry *de;
5476
5477 /* expect zsetnum input keys to be given */
5478 zsetnum = atoi(c->argv[2]->ptr);
5479 if (zsetnum < 1) {
5480 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5481 return;
5482 }
5483
5484 /* test if the expected number of keys would overflow */
5485 if (3+zsetnum > c->argc) {
5486 addReply(c,shared.syntaxerr);
5487 return;
5488 }
5489
5490 /* read keys to be used for input */
5491 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5492 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5493 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5494 if (!zsetobj) {
5495 src[i].dict = NULL;
5496 } else {
5497 if (zsetobj->type != REDIS_ZSET) {
5498 zfree(src);
5499 addReply(c,shared.wrongtypeerr);
5500 return;
5501 }
5502 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5503 }
5504
5505 /* default all weights to 1 */
5506 src[i].weight = 1.0;
5507 }
5508
5509 /* parse optional extra arguments */
5510 if (j < c->argc) {
5511 int remaining = c->argc - j;
5512
5513 while (remaining) {
5514 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5515 j++; remaining--;
5516 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5517 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5518 }
5519 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5520 j++; remaining--;
5521 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5522 aggregate = REDIS_AGGR_SUM;
5523 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5524 aggregate = REDIS_AGGR_MIN;
5525 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5526 aggregate = REDIS_AGGR_MAX;
5527 } else {
5528 zfree(src);
5529 addReply(c,shared.syntaxerr);
5530 return;
5531 }
5532 j++; remaining--;
5533 } else {
5534 zfree(src);
5535 addReply(c,shared.syntaxerr);
5536 return;
5537 }
5538 }
5539 }
5540
5541 /* sort sets from the smallest to largest, this will improve our
5542 * algorithm's performance */
5543 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5544
5545 dstobj = createZsetObject();
5546 dstzset = dstobj->ptr;
5547
5548 if (op == REDIS_OP_INTER) {
5549 /* skip going over all entries if the smallest zset is NULL or empty */
5550 if (src[0].dict && dictSize(src[0].dict) > 0) {
5551 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5552 * from small to large, all src[i > 0].dict are non-empty too */
5553 di = dictGetIterator(src[0].dict);
5554 while((de = dictNext(di)) != NULL) {
5555 double *score = zmalloc(sizeof(double)), value;
5556 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5557
5558 for (j = 1; j < zsetnum; j++) {
5559 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5560 if (other) {
5561 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5562 zunionInterAggregate(score, value, aggregate);
5563 } else {
5564 break;
5565 }
5566 }
5567
5568 /* skip entry when not present in every source dict */
5569 if (j != zsetnum) {
5570 zfree(score);
5571 } else {
5572 robj *o = dictGetEntryKey(de);
5573 dictAdd(dstzset->dict,o,score);
5574 incrRefCount(o); /* added to dictionary */
5575 zslInsert(dstzset->zsl,*score,o);
5576 incrRefCount(o); /* added to skiplist */
5577 }
5578 }
5579 dictReleaseIterator(di);
5580 }
5581 } else if (op == REDIS_OP_UNION) {
5582 for (i = 0; i < zsetnum; i++) {
5583 if (!src[i].dict) continue;
5584
5585 di = dictGetIterator(src[i].dict);
5586 while((de = dictNext(di)) != NULL) {
5587 /* skip key when already processed */
5588 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5589
5590 double *score = zmalloc(sizeof(double)), value;
5591 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5592
5593 /* because the zsets are sorted by size, its only possible
5594 * for sets at larger indices to hold this entry */
5595 for (j = (i+1); j < zsetnum; j++) {
5596 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5597 if (other) {
5598 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5599 zunionInterAggregate(score, value, aggregate);
5600 }
5601 }
5602
5603 robj *o = dictGetEntryKey(de);
5604 dictAdd(dstzset->dict,o,score);
5605 incrRefCount(o); /* added to dictionary */
5606 zslInsert(dstzset->zsl,*score,o);
5607 incrRefCount(o); /* added to skiplist */
5608 }
5609 dictReleaseIterator(di);
5610 }
5611 } else {
5612 /* unknown operator */
5613 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5614 }
5615
5616 deleteKey(c->db,dstkey);
5617 if (dstzset->zsl->length) {
5618 dictAdd(c->db->dict,dstkey,dstobj);
5619 incrRefCount(dstkey);
5620 addReplyLong(c, dstzset->zsl->length);
5621 server.dirty++;
5622 } else {
5623 decrRefCount(dstzset);
5624 addReply(c, shared.czero);
5625 }
5626 zfree(src);
5627 }
5628
5629 static void zunionCommand(redisClient *c) {
5630 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5631 }
5632
5633 static void zinterCommand(redisClient *c) {
5634 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5635 }
5636
5637 static void zrangeGenericCommand(redisClient *c, int reverse) {
5638 robj *o;
5639 int start = atoi(c->argv[2]->ptr);
5640 int end = atoi(c->argv[3]->ptr);
5641 int withscores = 0;
5642 int llen;
5643 int rangelen, j;
5644 zset *zsetobj;
5645 zskiplist *zsl;
5646 zskiplistNode *ln;
5647 robj *ele;
5648
5649 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5650 withscores = 1;
5651 } else if (c->argc >= 5) {
5652 addReply(c,shared.syntaxerr);
5653 return;
5654 }
5655
5656 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5657 checkType(c,o,REDIS_ZSET)) return;
5658 zsetobj = o->ptr;
5659 zsl = zsetobj->zsl;
5660 llen = zsl->length;
5661
5662 /* convert negative indexes */
5663 if (start < 0) start = llen+start;
5664 if (end < 0) end = llen+end;
5665 if (start < 0) start = 0;
5666 if (end < 0) end = 0;
5667
5668 /* indexes sanity checks */
5669 if (start > end || start >= llen) {
5670 /* Out of range start or start > end result in empty list */
5671 addReply(c,shared.emptymultibulk);
5672 return;
5673 }
5674 if (end >= llen) end = llen-1;
5675 rangelen = (end-start)+1;
5676
5677 /* check if starting point is trivial, before searching
5678 * the element in log(N) time */
5679 if (reverse) {
5680 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5681 } else {
5682 ln = start == 0 ?
5683 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5684 }
5685
5686 /* Return the result in form of a multi-bulk reply */
5687 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5688 withscores ? (rangelen*2) : rangelen));
5689 for (j = 0; j < rangelen; j++) {
5690 ele = ln->obj;
5691 addReplyBulk(c,ele);
5692 if (withscores)
5693 addReplyDouble(c,ln->score);
5694 ln = reverse ? ln->backward : ln->forward[0];
5695 }
5696 }
5697
5698 static void zrangeCommand(redisClient *c) {
5699 zrangeGenericCommand(c,0);
5700 }
5701
5702 static void zrevrangeCommand(redisClient *c) {
5703 zrangeGenericCommand(c,1);
5704 }
5705
5706 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5707 * If justcount is non-zero, just the count is returned. */
5708 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5709 robj *o;
5710 double min, max;
5711 int minex = 0, maxex = 0; /* are min or max exclusive? */
5712 int offset = 0, limit = -1;
5713 int withscores = 0;
5714 int badsyntax = 0;
5715
5716 /* Parse the min-max interval. If one of the values is prefixed
5717 * by the "(" character, it's considered "open". For instance
5718 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5719 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5720 if (((char*)c->argv[2]->ptr)[0] == '(') {
5721 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5722 minex = 1;
5723 } else {
5724 min = strtod(c->argv[2]->ptr,NULL);
5725 }
5726 if (((char*)c->argv[3]->ptr)[0] == '(') {
5727 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5728 maxex = 1;
5729 } else {
5730 max = strtod(c->argv[3]->ptr,NULL);
5731 }
5732
5733 /* Parse "WITHSCORES": note that if the command was called with
5734 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5735 * enter the following paths to parse WITHSCORES and LIMIT. */
5736 if (c->argc == 5 || c->argc == 8) {
5737 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5738 withscores = 1;
5739 else
5740 badsyntax = 1;
5741 }
5742 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5743 badsyntax = 1;
5744 if (badsyntax) {
5745 addReplySds(c,
5746 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5747 return;
5748 }
5749
5750 /* Parse "LIMIT" */
5751 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5752 addReply(c,shared.syntaxerr);
5753 return;
5754 } else if (c->argc == (7 + withscores)) {
5755 offset = atoi(c->argv[5]->ptr);
5756 limit = atoi(c->argv[6]->ptr);
5757 if (offset < 0) offset = 0;
5758 }
5759
5760 /* Ok, lookup the key and get the range */
5761 o = lookupKeyRead(c->db,c->argv[1]);
5762 if (o == NULL) {
5763 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5764 } else {
5765 if (o->type != REDIS_ZSET) {
5766 addReply(c,shared.wrongtypeerr);
5767 } else {
5768 zset *zsetobj = o->ptr;
5769 zskiplist *zsl = zsetobj->zsl;
5770 zskiplistNode *ln;
5771 robj *ele, *lenobj = NULL;
5772 unsigned long rangelen = 0;
5773
5774 /* Get the first node with the score >= min, or with
5775 * score > min if 'minex' is true. */
5776 ln = zslFirstWithScore(zsl,min);
5777 while (minex && ln && ln->score == min) ln = ln->forward[0];
5778
5779 if (ln == NULL) {
5780 /* No element matching the speciifed interval */
5781 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5782 return;
5783 }
5784
5785 /* We don't know in advance how many matching elements there
5786 * are in the list, so we push this object that will represent
5787 * the multi-bulk length in the output buffer, and will "fix"
5788 * it later */
5789 if (!justcount) {
5790 lenobj = createObject(REDIS_STRING,NULL);
5791 addReply(c,lenobj);
5792 decrRefCount(lenobj);
5793 }
5794
5795 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5796 if (offset) {
5797 offset--;
5798 ln = ln->forward[0];
5799 continue;
5800 }
5801 if (limit == 0) break;
5802 if (!justcount) {
5803 ele = ln->obj;
5804 addReplyBulk(c,ele);
5805 if (withscores)
5806 addReplyDouble(c,ln->score);
5807 }
5808 ln = ln->forward[0];
5809 rangelen++;
5810 if (limit > 0) limit--;
5811 }
5812 if (justcount) {
5813 addReplyLong(c,(long)rangelen);
5814 } else {
5815 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5816 withscores ? (rangelen*2) : rangelen);
5817 }
5818 }
5819 }
5820 }
5821
5822 static void zrangebyscoreCommand(redisClient *c) {
5823 genericZrangebyscoreCommand(c,0);
5824 }
5825
5826 static void zcountCommand(redisClient *c) {
5827 genericZrangebyscoreCommand(c,1);
5828 }
5829
5830 static void zcardCommand(redisClient *c) {
5831 robj *o;
5832 zset *zs;
5833
5834 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5835 checkType(c,o,REDIS_ZSET)) return;
5836
5837 zs = o->ptr;
5838 addReplyUlong(c,zs->zsl->length);
5839 }
5840
5841 static void zscoreCommand(redisClient *c) {
5842 robj *o;
5843 zset *zs;
5844 dictEntry *de;
5845
5846 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5847 checkType(c,o,REDIS_ZSET)) return;
5848
5849 zs = o->ptr;
5850 de = dictFind(zs->dict,c->argv[2]);
5851 if (!de) {
5852 addReply(c,shared.nullbulk);
5853 } else {
5854 double *score = dictGetEntryVal(de);
5855
5856 addReplyDouble(c,*score);
5857 }
5858 }
5859
5860 static void zrankGenericCommand(redisClient *c, int reverse) {
5861 robj *o;
5862 zset *zs;
5863 zskiplist *zsl;
5864 dictEntry *de;
5865 unsigned long rank;
5866 double *score;
5867
5868 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5869 checkType(c,o,REDIS_ZSET)) return;
5870
5871 zs = o->ptr;
5872 zsl = zs->zsl;
5873 de = dictFind(zs->dict,c->argv[2]);
5874 if (!de) {
5875 addReply(c,shared.nullbulk);
5876 return;
5877 }
5878
5879 score = dictGetEntryVal(de);
5880 rank = zslGetRank(zsl, *score, c->argv[2]);
5881 if (rank) {
5882 if (reverse) {
5883 addReplyLong(c, zsl->length - rank);
5884 } else {
5885 addReplyLong(c, rank-1);
5886 }
5887 } else {
5888 addReply(c,shared.nullbulk);
5889 }
5890 }
5891
5892 static void zrankCommand(redisClient *c) {
5893 zrankGenericCommand(c, 0);
5894 }
5895
5896 static void zrevrankCommand(redisClient *c) {
5897 zrankGenericCommand(c, 1);
5898 }
5899
5900 /* =================================== Hashes =============================== */
5901 static void hsetCommand(redisClient *c) {
5902 int update = 0;
5903 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5904
5905 if (o == NULL) {
5906 o = createHashObject();
5907 dictAdd(c->db->dict,c->argv[1],o);
5908 incrRefCount(c->argv[1]);
5909 } else {
5910 if (o->type != REDIS_HASH) {
5911 addReply(c,shared.wrongtypeerr);
5912 return;
5913 }
5914 }
5915 /* We want to convert the zipmap into an hash table right now if the
5916 * entry to be added is too big. Note that we check if the object
5917 * is integer encoded before to try fetching the length in the test below.
5918 * This is because integers are small, but currently stringObjectLen()
5919 * performs a slow conversion: not worth it. */
5920 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5921 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5922 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5923 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5924 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5925 {
5926 convertToRealHash(o);
5927 }
5928
5929 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5930 unsigned char *zm = o->ptr;
5931 robj *valobj = getDecodedObject(c->argv[3]);
5932
5933 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5934 valobj->ptr,sdslen(valobj->ptr),&update);
5935 decrRefCount(valobj);
5936 o->ptr = zm;
5937
5938 /* And here there is the second check for hash conversion...
5939 * we want to do it only if the operation was not just an update as
5940 * zipmapLen() is O(N). */
5941 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5942 convertToRealHash(o);
5943 } else {
5944 tryObjectEncoding(c->argv[2]);
5945 /* note that c->argv[3] is already encoded, as the latest arg
5946 * of a bulk command is always integer encoded if possible. */
5947 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5948 incrRefCount(c->argv[2]);
5949 } else {
5950 update = 1;
5951 }
5952 incrRefCount(c->argv[3]);
5953 }
5954 server.dirty++;
5955 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5956 }
5957
5958 static void hgetCommand(redisClient *c) {
5959 robj *o;
5960
5961 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5962 checkType(c,o,REDIS_HASH)) return;
5963
5964 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5965 unsigned char *zm = o->ptr;
5966 unsigned char *val;
5967 unsigned int vlen;
5968 robj *field;
5969
5970 field = getDecodedObject(c->argv[2]);
5971 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
5972 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5973 addReplySds(c,sdsnewlen(val,vlen));
5974 addReply(c,shared.crlf);
5975 decrRefCount(field);
5976 return;
5977 } else {
5978 addReply(c,shared.nullbulk);
5979 decrRefCount(field);
5980 return;
5981 }
5982 } else {
5983 struct dictEntry *de;
5984
5985 de = dictFind(o->ptr,c->argv[2]);
5986 if (de == NULL) {
5987 addReply(c,shared.nullbulk);
5988 } else {
5989 robj *e = dictGetEntryVal(de);
5990
5991 addReplyBulk(c,e);
5992 }
5993 }
5994 }
5995
5996 static void hdelCommand(redisClient *c) {
5997 robj *o;
5998 int deleted = 0;
5999
6000 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6001 checkType(c,o,REDIS_HASH)) return;
6002
6003 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6004 robj *field = getDecodedObject(c->argv[2]);
6005
6006 o->ptr = zipmapDel((unsigned char*) o->ptr,
6007 (unsigned char*) field->ptr,
6008 sdslen(field->ptr), &deleted);
6009 decrRefCount(field);
6010 if (zipmapLen((unsigned char*) o->ptr) == 0)
6011 deleteKey(c->db,c->argv[1]);
6012 } else {
6013 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6014 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6015 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6016 }
6017 if (deleted) server.dirty++;
6018 addReply(c,deleted ? shared.cone : shared.czero);
6019 }
6020
6021 static void hlenCommand(redisClient *c) {
6022 robj *o;
6023 unsigned long len;
6024
6025 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6026 checkType(c,o,REDIS_HASH)) return;
6027
6028 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6029 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6030 addReplyUlong(c,len);
6031 }
6032
6033 #define REDIS_GETALL_KEYS 1
6034 #define REDIS_GETALL_VALS 2
6035 static void genericHgetallCommand(redisClient *c, int flags) {
6036 robj *o, *lenobj;
6037 unsigned long count = 0;
6038
6039 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6040 || checkType(c,o,REDIS_HASH)) return;
6041
6042 lenobj = createObject(REDIS_STRING,NULL);
6043 addReply(c,lenobj);
6044 decrRefCount(lenobj);
6045
6046 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6047 unsigned char *p = zipmapRewind(o->ptr);
6048 unsigned char *field, *val;
6049 unsigned int flen, vlen;
6050
6051 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6052 robj *aux;
6053
6054 if (flags & REDIS_GETALL_KEYS) {
6055 aux = createStringObject((char*)field,flen);
6056 addReplyBulk(c,aux);
6057 decrRefCount(aux);
6058 count++;
6059 }
6060 if (flags & REDIS_GETALL_VALS) {
6061 aux = createStringObject((char*)val,vlen);
6062 addReplyBulk(c,aux);
6063 decrRefCount(aux);
6064 count++;
6065 }
6066 }
6067 } else {
6068 dictIterator *di = dictGetIterator(o->ptr);
6069 dictEntry *de;
6070
6071 while((de = dictNext(di)) != NULL) {
6072 robj *fieldobj = dictGetEntryKey(de);
6073 robj *valobj = dictGetEntryVal(de);
6074
6075 if (flags & REDIS_GETALL_KEYS) {
6076 addReplyBulk(c,fieldobj);
6077 count++;
6078 }
6079 if (flags & REDIS_GETALL_VALS) {
6080 addReplyBulk(c,valobj);
6081 count++;
6082 }
6083 }
6084 dictReleaseIterator(di);
6085 }
6086 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6087 }
6088
6089 static void hkeysCommand(redisClient *c) {
6090 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6091 }
6092
6093 static void hvalsCommand(redisClient *c) {
6094 genericHgetallCommand(c,REDIS_GETALL_VALS);
6095 }
6096
6097 static void hgetallCommand(redisClient *c) {
6098 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6099 }
6100
6101 static void hexistsCommand(redisClient *c) {
6102 robj *o;
6103 int exists = 0;
6104
6105 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6106 checkType(c,o,REDIS_HASH)) return;
6107
6108 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6109 robj *field;
6110 unsigned char *zm = o->ptr;
6111
6112 field = getDecodedObject(c->argv[2]);
6113 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6114 decrRefCount(field);
6115 } else {
6116 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6117 }
6118 addReply(c,exists ? shared.cone : shared.czero);
6119 }
6120
6121 static void convertToRealHash(robj *o) {
6122 unsigned char *key, *val, *p, *zm = o->ptr;
6123 unsigned int klen, vlen;
6124 dict *dict = dictCreate(&hashDictType,NULL);
6125
6126 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6127 p = zipmapRewind(zm);
6128 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6129 robj *keyobj, *valobj;
6130
6131 keyobj = createStringObject((char*)key,klen);
6132 valobj = createStringObject((char*)val,vlen);
6133 tryObjectEncoding(keyobj);
6134 tryObjectEncoding(valobj);
6135 dictAdd(dict,keyobj,valobj);
6136 }
6137 o->encoding = REDIS_ENCODING_HT;
6138 o->ptr = dict;
6139 zfree(zm);
6140 }
6141
6142 /* ========================= Non type-specific commands ==================== */
6143
6144 static void flushdbCommand(redisClient *c) {
6145 server.dirty += dictSize(c->db->dict);
6146 dictEmpty(c->db->dict);
6147 dictEmpty(c->db->expires);
6148 addReply(c,shared.ok);
6149 }
6150
6151 static void flushallCommand(redisClient *c) {
6152 server.dirty += emptyDb();
6153 addReply(c,shared.ok);
6154 if (server.bgsavechildpid != -1) {
6155 kill(server.bgsavechildpid,SIGKILL);
6156 rdbRemoveTempFile(server.bgsavechildpid);
6157 }
6158 rdbSave(server.dbfilename);
6159 server.dirty++;
6160 }
6161
6162 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6163 redisSortOperation *so = zmalloc(sizeof(*so));
6164 so->type = type;
6165 so->pattern = pattern;
6166 return so;
6167 }
6168
6169 /* Return the value associated to the key with a name obtained
6170 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6171 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6172 char *p;
6173 sds spat, ssub;
6174 robj keyobj;
6175 int prefixlen, sublen, postfixlen;
6176 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6177 struct {
6178 long len;
6179 long free;
6180 char buf[REDIS_SORTKEY_MAX+1];
6181 } keyname;
6182
6183 /* If the pattern is "#" return the substitution object itself in order
6184 * to implement the "SORT ... GET #" feature. */
6185 spat = pattern->ptr;
6186 if (spat[0] == '#' && spat[1] == '\0') {
6187 return subst;
6188 }
6189
6190 /* The substitution object may be specially encoded. If so we create
6191 * a decoded object on the fly. Otherwise getDecodedObject will just
6192 * increment the ref count, that we'll decrement later. */
6193 subst = getDecodedObject(subst);
6194
6195 ssub = subst->ptr;
6196 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6197 p = strchr(spat,'*');
6198 if (!p) {
6199 decrRefCount(subst);
6200 return NULL;
6201 }
6202
6203 prefixlen = p-spat;
6204 sublen = sdslen(ssub);
6205 postfixlen = sdslen(spat)-(prefixlen+1);
6206 memcpy(keyname.buf,spat,prefixlen);
6207 memcpy(keyname.buf+prefixlen,ssub,sublen);
6208 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6209 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6210 keyname.len = prefixlen+sublen+postfixlen;
6211
6212 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6213 decrRefCount(subst);
6214
6215 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6216 return lookupKeyRead(db,&keyobj);
6217 }
6218
6219 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6220 * the additional parameter is not standard but a BSD-specific we have to
6221 * pass sorting parameters via the global 'server' structure */
6222 static int sortCompare(const void *s1, const void *s2) {
6223 const redisSortObject *so1 = s1, *so2 = s2;
6224 int cmp;
6225
6226 if (!server.sort_alpha) {
6227 /* Numeric sorting. Here it's trivial as we precomputed scores */
6228 if (so1->u.score > so2->u.score) {
6229 cmp = 1;
6230 } else if (so1->u.score < so2->u.score) {
6231 cmp = -1;
6232 } else {
6233 cmp = 0;
6234 }
6235 } else {
6236 /* Alphanumeric sorting */
6237 if (server.sort_bypattern) {
6238 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6239 /* At least one compare object is NULL */
6240 if (so1->u.cmpobj == so2->u.cmpobj)
6241 cmp = 0;
6242 else if (so1->u.cmpobj == NULL)
6243 cmp = -1;
6244 else
6245 cmp = 1;
6246 } else {
6247 /* We have both the objects, use strcoll */
6248 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6249 }
6250 } else {
6251 /* Compare elements directly */
6252 robj *dec1, *dec2;
6253
6254 dec1 = getDecodedObject(so1->obj);
6255 dec2 = getDecodedObject(so2->obj);
6256 cmp = strcoll(dec1->ptr,dec2->ptr);
6257 decrRefCount(dec1);
6258 decrRefCount(dec2);
6259 }
6260 }
6261 return server.sort_desc ? -cmp : cmp;
6262 }
6263
6264 /* The SORT command is the most complex command in Redis. Warning: this code
6265 * is optimized for speed and a bit less for readability */
6266 static void sortCommand(redisClient *c) {
6267 list *operations;
6268 int outputlen = 0;
6269 int desc = 0, alpha = 0;
6270 int limit_start = 0, limit_count = -1, start, end;
6271 int j, dontsort = 0, vectorlen;
6272 int getop = 0; /* GET operation counter */
6273 robj *sortval, *sortby = NULL, *storekey = NULL;
6274 redisSortObject *vector; /* Resulting vector to sort */
6275
6276 /* Lookup the key to sort. It must be of the right types */
6277 sortval = lookupKeyRead(c->db,c->argv[1]);
6278 if (sortval == NULL) {
6279 addReply(c,shared.nullmultibulk);
6280 return;
6281 }
6282 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6283 sortval->type != REDIS_ZSET)
6284 {
6285 addReply(c,shared.wrongtypeerr);
6286 return;
6287 }
6288
6289 /* Create a list of operations to perform for every sorted element.
6290 * Operations can be GET/DEL/INCR/DECR */
6291 operations = listCreate();
6292 listSetFreeMethod(operations,zfree);
6293 j = 2;
6294
6295 /* Now we need to protect sortval incrementing its count, in the future
6296 * SORT may have options able to overwrite/delete keys during the sorting
6297 * and the sorted key itself may get destroied */
6298 incrRefCount(sortval);
6299
6300 /* The SORT command has an SQL-alike syntax, parse it */
6301 while(j < c->argc) {
6302 int leftargs = c->argc-j-1;
6303 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6304 desc = 0;
6305 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6306 desc = 1;
6307 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6308 alpha = 1;
6309 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6310 limit_start = atoi(c->argv[j+1]->ptr);
6311 limit_count = atoi(c->argv[j+2]->ptr);
6312 j+=2;
6313 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6314 storekey = c->argv[j+1];
6315 j++;
6316 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6317 sortby = c->argv[j+1];
6318 /* If the BY pattern does not contain '*', i.e. it is constant,
6319 * we don't need to sort nor to lookup the weight keys. */
6320 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6321 j++;
6322 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6323 listAddNodeTail(operations,createSortOperation(
6324 REDIS_SORT_GET,c->argv[j+1]));
6325 getop++;
6326 j++;
6327 } else {
6328 decrRefCount(sortval);
6329 listRelease(operations);
6330 addReply(c,shared.syntaxerr);
6331 return;
6332 }
6333 j++;
6334 }
6335
6336 /* Load the sorting vector with all the objects to sort */
6337 switch(sortval->type) {
6338 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6339 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6340 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6341 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6342 }
6343 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6344 j = 0;
6345
6346 if (sortval->type == REDIS_LIST) {
6347 list *list = sortval->ptr;
6348 listNode *ln;
6349 listIter li;
6350
6351 listRewind(list,&li);
6352 while((ln = listNext(&li))) {
6353 robj *ele = ln->value;
6354 vector[j].obj = ele;
6355 vector[j].u.score = 0;
6356 vector[j].u.cmpobj = NULL;
6357 j++;
6358 }
6359 } else {
6360 dict *set;
6361 dictIterator *di;
6362 dictEntry *setele;
6363
6364 if (sortval->type == REDIS_SET) {
6365 set = sortval->ptr;
6366 } else {
6367 zset *zs = sortval->ptr;
6368 set = zs->dict;
6369 }
6370
6371 di = dictGetIterator(set);
6372 while((setele = dictNext(di)) != NULL) {
6373 vector[j].obj = dictGetEntryKey(setele);
6374 vector[j].u.score = 0;
6375 vector[j].u.cmpobj = NULL;
6376 j++;
6377 }
6378 dictReleaseIterator(di);
6379 }
6380 redisAssert(j == vectorlen);
6381
6382 /* Now it's time to load the right scores in the sorting vector */
6383 if (dontsort == 0) {
6384 for (j = 0; j < vectorlen; j++) {
6385 if (sortby) {
6386 robj *byval;
6387
6388 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6389 if (!byval || byval->type != REDIS_STRING) continue;
6390 if (alpha) {
6391 vector[j].u.cmpobj = getDecodedObject(byval);
6392 } else {
6393 if (byval->encoding == REDIS_ENCODING_RAW) {
6394 vector[j].u.score = strtod(byval->ptr,NULL);
6395 } else {
6396 /* Don't need to decode the object if it's
6397 * integer-encoded (the only encoding supported) so
6398 * far. We can just cast it */
6399 if (byval->encoding == REDIS_ENCODING_INT) {
6400 vector[j].u.score = (long)byval->ptr;
6401 } else
6402 redisAssert(1 != 1);
6403 }
6404 }
6405 } else {
6406 if (!alpha) {
6407 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6408 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6409 else {
6410 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6411 vector[j].u.score = (long) vector[j].obj->ptr;
6412 else
6413 redisAssert(1 != 1);
6414 }
6415 }
6416 }
6417 }
6418 }
6419
6420 /* We are ready to sort the vector... perform a bit of sanity check
6421 * on the LIMIT option too. We'll use a partial version of quicksort. */
6422 start = (limit_start < 0) ? 0 : limit_start;
6423 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6424 if (start >= vectorlen) {
6425 start = vectorlen-1;
6426 end = vectorlen-2;
6427 }
6428 if (end >= vectorlen) end = vectorlen-1;
6429
6430 if (dontsort == 0) {
6431 server.sort_desc = desc;
6432 server.sort_alpha = alpha;
6433 server.sort_bypattern = sortby ? 1 : 0;
6434 if (sortby && (start != 0 || end != vectorlen-1))
6435 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6436 else
6437 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6438 }
6439
6440 /* Send command output to the output buffer, performing the specified
6441 * GET/DEL/INCR/DECR operations if any. */
6442 outputlen = getop ? getop*(end-start+1) : end-start+1;
6443 if (storekey == NULL) {
6444 /* STORE option not specified, sent the sorting result to client */
6445 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6446 for (j = start; j <= end; j++) {
6447 listNode *ln;
6448 listIter li;
6449
6450 if (!getop) addReplyBulk(c,vector[j].obj);
6451 listRewind(operations,&li);
6452 while((ln = listNext(&li))) {
6453 redisSortOperation *sop = ln->value;
6454 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6455 vector[j].obj);
6456
6457 if (sop->type == REDIS_SORT_GET) {
6458 if (!val || val->type != REDIS_STRING) {
6459 addReply(c,shared.nullbulk);
6460 } else {
6461 addReplyBulk(c,val);
6462 }
6463 } else {
6464 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6465 }
6466 }
6467 }
6468 } else {
6469 robj *listObject = createListObject();
6470 list *listPtr = (list*) listObject->ptr;
6471
6472 /* STORE option specified, set the sorting result as a List object */
6473 for (j = start; j <= end; j++) {
6474 listNode *ln;
6475 listIter li;
6476
6477 if (!getop) {
6478 listAddNodeTail(listPtr,vector[j].obj);
6479 incrRefCount(vector[j].obj);
6480 }
6481 listRewind(operations,&li);
6482 while((ln = listNext(&li))) {
6483 redisSortOperation *sop = ln->value;
6484 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6485 vector[j].obj);
6486
6487 if (sop->type == REDIS_SORT_GET) {
6488 if (!val || val->type != REDIS_STRING) {
6489 listAddNodeTail(listPtr,createStringObject("",0));
6490 } else {
6491 listAddNodeTail(listPtr,val);
6492 incrRefCount(val);
6493 }
6494 } else {
6495 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6496 }
6497 }
6498 }
6499 if (dictReplace(c->db->dict,storekey,listObject)) {
6500 incrRefCount(storekey);
6501 }
6502 /* Note: we add 1 because the DB is dirty anyway since even if the
6503 * SORT result is empty a new key is set and maybe the old content
6504 * replaced. */
6505 server.dirty += 1+outputlen;
6506 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6507 }
6508
6509 /* Cleanup */
6510 decrRefCount(sortval);
6511 listRelease(operations);
6512 for (j = 0; j < vectorlen; j++) {
6513 if (sortby && alpha && vector[j].u.cmpobj)
6514 decrRefCount(vector[j].u.cmpobj);
6515 }
6516 zfree(vector);
6517 }
6518
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the output buffer, provided by the caller, and must be large
 * enough for the formatted value (at most 20 digits plus the unit and the
 * nul terminator). 'n' is the amount of bytes.
 *
 * Every value of 'n' produces an output: amounts of one tebibyte or more
 * use the T and P units, and anything bigger falls back to plain bytes, so
 * the buffer is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: print the raw amount of bytes */
        sprintf(s,"%lluB",n);
    }
}
6539
6540 /* Create the string returned by the INFO command. This is decoupled
6541 * by the INFO command itself as we need to report the same information
6542 * on memory corruption problems. */
6543 static sds genRedisInfoString(void) {
6544 sds info;
6545 time_t uptime = time(NULL)-server.stat_starttime;
6546 int j;
6547 char hmem[64];
6548
6549 bytesToHuman(hmem,zmalloc_used_memory());
6550 info = sdscatprintf(sdsempty(),
6551 "redis_version:%s\r\n"
6552 "arch_bits:%s\r\n"
6553 "multiplexing_api:%s\r\n"
6554 "process_id:%ld\r\n"
6555 "uptime_in_seconds:%ld\r\n"
6556 "uptime_in_days:%ld\r\n"
6557 "connected_clients:%d\r\n"
6558 "connected_slaves:%d\r\n"
6559 "blocked_clients:%d\r\n"
6560 "used_memory:%zu\r\n"
6561 "used_memory_human:%s\r\n"
6562 "changes_since_last_save:%lld\r\n"
6563 "bgsave_in_progress:%d\r\n"
6564 "last_save_time:%ld\r\n"
6565 "bgrewriteaof_in_progress:%d\r\n"
6566 "total_connections_received:%lld\r\n"
6567 "total_commands_processed:%lld\r\n"
6568 "expired_keys:%lld\r\n"
6569 "hash_max_zipmap_entries:%ld\r\n"
6570 "hash_max_zipmap_value:%ld\r\n"
6571 "vm_enabled:%d\r\n"
6572 "role:%s\r\n"
6573 ,REDIS_VERSION,
6574 (sizeof(long) == 8) ? "64" : "32",
6575 aeGetApiName(),
6576 (long) getpid(),
6577 uptime,
6578 uptime/(3600*24),
6579 listLength(server.clients)-listLength(server.slaves),
6580 listLength(server.slaves),
6581 server.blpop_blocked_clients,
6582 zmalloc_used_memory(),
6583 hmem,
6584 server.dirty,
6585 server.bgsavechildpid != -1,
6586 server.lastsave,
6587 server.bgrewritechildpid != -1,
6588 server.stat_numconnections,
6589 server.stat_numcommands,
6590 server.stat_expiredkeys,
6591 server.hash_max_zipmap_entries,
6592 server.hash_max_zipmap_value,
6593 server.vm_enabled != 0,
6594 server.masterhost == NULL ? "master" : "slave"
6595 );
6596 if (server.masterhost) {
6597 info = sdscatprintf(info,
6598 "master_host:%s\r\n"
6599 "master_port:%d\r\n"
6600 "master_link_status:%s\r\n"
6601 "master_last_io_seconds_ago:%d\r\n"
6602 ,server.masterhost,
6603 server.masterport,
6604 (server.replstate == REDIS_REPL_CONNECTED) ?
6605 "up" : "down",
6606 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6607 );
6608 }
6609 if (server.vm_enabled) {
6610 lockThreadedIO();
6611 info = sdscatprintf(info,
6612 "vm_conf_max_memory:%llu\r\n"
6613 "vm_conf_page_size:%llu\r\n"
6614 "vm_conf_pages:%llu\r\n"
6615 "vm_stats_used_pages:%llu\r\n"
6616 "vm_stats_swapped_objects:%llu\r\n"
6617 "vm_stats_swappin_count:%llu\r\n"
6618 "vm_stats_swappout_count:%llu\r\n"
6619 "vm_stats_io_newjobs_len:%lu\r\n"
6620 "vm_stats_io_processing_len:%lu\r\n"
6621 "vm_stats_io_processed_len:%lu\r\n"
6622 "vm_stats_io_active_threads:%lu\r\n"
6623 "vm_stats_blocked_clients:%lu\r\n"
6624 ,(unsigned long long) server.vm_max_memory,
6625 (unsigned long long) server.vm_page_size,
6626 (unsigned long long) server.vm_pages,
6627 (unsigned long long) server.vm_stats_used_pages,
6628 (unsigned long long) server.vm_stats_swapped_objects,
6629 (unsigned long long) server.vm_stats_swapins,
6630 (unsigned long long) server.vm_stats_swapouts,
6631 (unsigned long) listLength(server.io_newjobs),
6632 (unsigned long) listLength(server.io_processing),
6633 (unsigned long) listLength(server.io_processed),
6634 (unsigned long) server.io_active_threads,
6635 (unsigned long) server.vm_blocked_clients
6636 );
6637 unlockThreadedIO();
6638 }
6639 for (j = 0; j < server.dbnum; j++) {
6640 long long keys, vkeys;
6641
6642 keys = dictSize(server.db[j].dict);
6643 vkeys = dictSize(server.db[j].expires);
6644 if (keys || vkeys) {
6645 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6646 j, keys, vkeys);
6647 }
6648 }
6649 return info;
6650 }
6651
6652 static void infoCommand(redisClient *c) {
6653 sds info = genRedisInfoString();
6654 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6655 (unsigned long)sdslen(info)));
6656 addReplySds(c,info);
6657 addReply(c,shared.crlf);
6658 }
6659
6660 static void monitorCommand(redisClient *c) {
6661 /* ignore MONITOR if aleady slave or in monitor mode */
6662 if (c->flags & REDIS_SLAVE) return;
6663
6664 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6665 c->slaveseldb = 0;
6666 listAddNodeTail(server.monitors,c);
6667 addReply(c,shared.ok);
6668 }
6669
6670 /* ================================= Expire ================================= */
6671 static int removeExpire(redisDb *db, robj *key) {
6672 if (dictDelete(db->expires,key) == DICT_OK) {
6673 return 1;
6674 } else {
6675 return 0;
6676 }
6677 }
6678
6679 static int setExpire(redisDb *db, robj *key, time_t when) {
6680 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6681 return 0;
6682 } else {
6683 incrRefCount(key);
6684 return 1;
6685 }
6686 }
6687
6688 /* Return the expire time of the specified key, or -1 if no expire
6689 * is associated with this key (i.e. the key is non volatile) */
6690 static time_t getExpire(redisDb *db, robj *key) {
6691 dictEntry *de;
6692
6693 /* No expire? return ASAP */
6694 if (dictSize(db->expires) == 0 ||
6695 (de = dictFind(db->expires,key)) == NULL) return -1;
6696
6697 return (time_t) dictGetEntryVal(de);
6698 }
6699
6700 static int expireIfNeeded(redisDb *db, robj *key) {
6701 time_t when;
6702 dictEntry *de;
6703
6704 /* No expire? return ASAP */
6705 if (dictSize(db->expires) == 0 ||
6706 (de = dictFind(db->expires,key)) == NULL) return 0;
6707
6708 /* Lookup the expire */
6709 when = (time_t) dictGetEntryVal(de);
6710 if (time(NULL) <= when) return 0;
6711
6712 /* Delete the key */
6713 dictDelete(db->expires,key);
6714 server.stat_expiredkeys++;
6715 return dictDelete(db->dict,key) == DICT_OK;
6716 }
6717
6718 static int deleteIfVolatile(redisDb *db, robj *key) {
6719 dictEntry *de;
6720
6721 /* No expire? return ASAP */
6722 if (dictSize(db->expires) == 0 ||
6723 (de = dictFind(db->expires,key)) == NULL) return 0;
6724
6725 /* Delete the key */
6726 server.dirty++;
6727 server.stat_expiredkeys++;
6728 dictDelete(db->expires,key);
6729 return dictDelete(db->dict,key) == DICT_OK;
6730 }
6731
6732 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6733 dictEntry *de;
6734
6735 de = dictFind(c->db->dict,key);
6736 if (de == NULL) {
6737 addReply(c,shared.czero);
6738 return;
6739 }
6740 if (seconds < 0) {
6741 if (deleteKey(c->db,key)) server.dirty++;
6742 addReply(c, shared.cone);
6743 return;
6744 } else {
6745 time_t when = time(NULL)+seconds;
6746 if (setExpire(c->db,key,when)) {
6747 addReply(c,shared.cone);
6748 server.dirty++;
6749 } else {
6750 addReply(c,shared.czero);
6751 }
6752 return;
6753 }
6754 }
6755
6756 static void expireCommand(redisClient *c) {
6757 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6758 }
6759
6760 static void expireatCommand(redisClient *c) {
6761 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6762 }
6763
6764 static void ttlCommand(redisClient *c) {
6765 time_t expire;
6766 int ttl = -1;
6767
6768 expire = getExpire(c->db,c->argv[1]);
6769 if (expire != -1) {
6770 ttl = (int) (expire-time(NULL));
6771 if (ttl < 0) ttl = -1;
6772 }
6773 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6774 }
6775
6776 /* ================================ MULTI/EXEC ============================== */
6777
6778 /* Client state initialization for MULTI/EXEC */
6779 static void initClientMultiState(redisClient *c) {
6780 c->mstate.commands = NULL;
6781 c->mstate.count = 0;
6782 }
6783
6784 /* Release all the resources associated with MULTI/EXEC state */
6785 static void freeClientMultiState(redisClient *c) {
6786 int j;
6787
6788 for (j = 0; j < c->mstate.count; j++) {
6789 int i;
6790 multiCmd *mc = c->mstate.commands+j;
6791
6792 for (i = 0; i < mc->argc; i++)
6793 decrRefCount(mc->argv[i]);
6794 zfree(mc->argv);
6795 }
6796 zfree(c->mstate.commands);
6797 }
6798
6799 /* Add a new command into the MULTI commands queue */
6800 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6801 multiCmd *mc;
6802 int j;
6803
6804 c->mstate.commands = zrealloc(c->mstate.commands,
6805 sizeof(multiCmd)*(c->mstate.count+1));
6806 mc = c->mstate.commands+c->mstate.count;
6807 mc->cmd = cmd;
6808 mc->argc = c->argc;
6809 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6810 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6811 for (j = 0; j < c->argc; j++)
6812 incrRefCount(mc->argv[j]);
6813 c->mstate.count++;
6814 }
6815
6816 static void multiCommand(redisClient *c) {
6817 c->flags |= REDIS_MULTI;
6818 addReply(c,shared.ok);
6819 }
6820
6821 static void discardCommand(redisClient *c) {
6822 if (!(c->flags & REDIS_MULTI)) {
6823 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6824 return;
6825 }
6826
6827 freeClientMultiState(c);
6828 initClientMultiState(c);
6829 c->flags &= (~REDIS_MULTI);
6830 addReply(c,shared.ok);
6831 }
6832
6833 static void execCommand(redisClient *c) {
6834 int j;
6835 robj **orig_argv;
6836 int orig_argc;
6837
6838 if (!(c->flags & REDIS_MULTI)) {
6839 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6840 return;
6841 }
6842
6843 orig_argv = c->argv;
6844 orig_argc = c->argc;
6845 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6846 for (j = 0; j < c->mstate.count; j++) {
6847 c->argc = c->mstate.commands[j].argc;
6848 c->argv = c->mstate.commands[j].argv;
6849 call(c,c->mstate.commands[j].cmd);
6850 }
6851 c->argv = orig_argv;
6852 c->argc = orig_argc;
6853 freeClientMultiState(c);
6854 initClientMultiState(c);
6855 c->flags &= (~REDIS_MULTI);
6856 }
6857
6858 /* =========================== Blocking Operations ========================= */
6859
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still, it's important to note that list blocking operations can already
 * be used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6888
6889 /* Set a client in blocking mode for the specified key, with the specified
6890 * timeout */
6891 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6892 dictEntry *de;
6893 list *l;
6894 int j;
6895
6896 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6897 c->blockingkeysnum = numkeys;
6898 c->blockingto = timeout;
6899 for (j = 0; j < numkeys; j++) {
6900 /* Add the key in the client structure, to map clients -> keys */
6901 c->blockingkeys[j] = keys[j];
6902 incrRefCount(keys[j]);
6903
6904 /* And in the other "side", to map keys -> clients */
6905 de = dictFind(c->db->blockingkeys,keys[j]);
6906 if (de == NULL) {
6907 int retval;
6908
6909 /* For every key we take a list of clients blocked for it */
6910 l = listCreate();
6911 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6912 incrRefCount(keys[j]);
6913 assert(retval == DICT_OK);
6914 } else {
6915 l = dictGetEntryVal(de);
6916 }
6917 listAddNodeTail(l,c);
6918 }
6919 /* Mark the client as a blocked client */
6920 c->flags |= REDIS_BLOCKED;
6921 server.blpop_blocked_clients++;
6922 }
6923
6924 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6925 static void unblockClientWaitingData(redisClient *c) {
6926 dictEntry *de;
6927 list *l;
6928 int j;
6929
6930 assert(c->blockingkeys != NULL);
6931 /* The client may wait for multiple keys, so unblock it for every key. */
6932 for (j = 0; j < c->blockingkeysnum; j++) {
6933 /* Remove this client from the list of clients waiting for this key. */
6934 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6935 assert(de != NULL);
6936 l = dictGetEntryVal(de);
6937 listDelNode(l,listSearchKey(l,c));
6938 /* If the list is empty we need to remove it to avoid wasting memory */
6939 if (listLength(l) == 0)
6940 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6941 decrRefCount(c->blockingkeys[j]);
6942 }
6943 /* Cleanup the client structure */
6944 zfree(c->blockingkeys);
6945 c->blockingkeys = NULL;
6946 c->flags &= (~REDIS_BLOCKED);
6947 server.blpop_blocked_clients--;
6948 /* We want to process data if there is some command waiting
6949 * in the input buffer. Note that this is safe even if
6950 * unblockClientWaitingData() gets called from freeClient() because
6951 * freeClient() will be smart enough to call this function
6952 * *after* c->querybuf was set to NULL. */
6953 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6954 }
6955
6956 /* This should be called from any function PUSHing into lists.
6957 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6958 * 'ele' is the element pushed.
6959 *
6960 * If the function returns 0 there was no client waiting for a list push
6961 * against this key.
6962 *
6963 * If the function returns 1 there was a client waiting for a list push
6964 * against this key, the element was passed to this client thus it's not
6965 * needed to actually add it to the list and the caller should return asap. */
6966 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6967 struct dictEntry *de;
6968 redisClient *receiver;
6969 list *l;
6970 listNode *ln;
6971
6972 de = dictFind(c->db->blockingkeys,key);
6973 if (de == NULL) return 0;
6974 l = dictGetEntryVal(de);
6975 ln = listFirst(l);
6976 assert(ln != NULL);
6977 receiver = ln->value;
6978
6979 addReplySds(receiver,sdsnew("*2\r\n"));
6980 addReplyBulk(receiver,key);
6981 addReplyBulk(receiver,ele);
6982 unblockClientWaitingData(receiver);
6983 return 1;
6984 }
6985
6986 /* Blocking RPOP/LPOP */
6987 static void blockingPopGenericCommand(redisClient *c, int where) {
6988 robj *o;
6989 time_t timeout;
6990 int j;
6991
6992 for (j = 1; j < c->argc-1; j++) {
6993 o = lookupKeyWrite(c->db,c->argv[j]);
6994 if (o != NULL) {
6995 if (o->type != REDIS_LIST) {
6996 addReply(c,shared.wrongtypeerr);
6997 return;
6998 } else {
6999 list *list = o->ptr;
7000 if (listLength(list) != 0) {
7001 /* If the list contains elements fall back to the usual
7002 * non-blocking POP operation */
7003 robj *argv[2], **orig_argv;
7004 int orig_argc;
7005
7006 /* We need to alter the command arguments before to call
7007 * popGenericCommand() as the command takes a single key. */
7008 orig_argv = c->argv;
7009 orig_argc = c->argc;
7010 argv[1] = c->argv[j];
7011 c->argv = argv;
7012 c->argc = 2;
7013
7014 /* Also the return value is different, we need to output
7015 * the multi bulk reply header and the key name. The
7016 * "real" command will add the last element (the value)
7017 * for us. If this souds like an hack to you it's just
7018 * because it is... */
7019 addReplySds(c,sdsnew("*2\r\n"));
7020 addReplyBulk(c,argv[1]);
7021 popGenericCommand(c,where);
7022
7023 /* Fix the client structure with the original stuff */
7024 c->argv = orig_argv;
7025 c->argc = orig_argc;
7026 return;
7027 }
7028 }
7029 }
7030 }
7031 /* If the list is empty or the key does not exists we must block */
7032 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7033 if (timeout > 0) timeout += time(NULL);
7034 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7035 }
7036
7037 static void blpopCommand(redisClient *c) {
7038 blockingPopGenericCommand(c,REDIS_HEAD);
7039 }
7040
7041 static void brpopCommand(redisClient *c) {
7042 blockingPopGenericCommand(c,REDIS_TAIL);
7043 }
7044
7045 /* =============================== Replication ============================= */
7046
7047 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7048 ssize_t nwritten, ret = size;
7049 time_t start = time(NULL);
7050
7051 timeout++;
7052 while(size) {
7053 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7054 nwritten = write(fd,ptr,size);
7055 if (nwritten == -1) return -1;
7056 ptr += nwritten;
7057 size -= nwritten;
7058 }
7059 if ((time(NULL)-start) > timeout) {
7060 errno = ETIMEDOUT;
7061 return -1;
7062 }
7063 }
7064 return ret;
7065 }
7066
7067 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7068 ssize_t nread, totread = 0;
7069 time_t start = time(NULL);
7070
7071 timeout++;
7072 while(size) {
7073 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7074 nread = read(fd,ptr,size);
7075 if (nread == -1) return -1;
7076 ptr += nread;
7077 size -= nread;
7078 totread += nread;
7079 }
7080 if ((time(NULL)-start) > timeout) {
7081 errno = ETIMEDOUT;
7082 return -1;
7083 }
7084 }
7085 return totread;
7086 }
7087
/* Read a line from 'fd' one byte at a time using syncRead(), storing it in
 * the buffer 'ptr' of 'size' bytes. 'timeout' is passed to every syncRead()
 * call.
 *
 * Reading stops at the first '\n'. The newline is not stored, and a '\r'
 * stored right before it is replaced by the nul terminator. Reading also
 * stops when 'size'-1 bytes were stored, so that the buffer, always nul
 * terminated, is never overflowed; in that case the rest of the line is
 * left unread.
 *
 * Returns the number of bytes stored (a trailing '\r' is counted), or -1 if
 * syncRead() fails. Returns 0 without touching the buffer if 'size' is not
 * positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve room for the nul terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One less byte available: this is what bounds the loop. */
        size--;
    }
    return nread;
}
7108
7109 static void syncCommand(redisClient *c) {
7110 /* ignore SYNC if aleady slave or in monitor mode */
7111 if (c->flags & REDIS_SLAVE) return;
7112
7113 /* SYNC can't be issued when the server has pending data to send to
7114 * the client about already issued commands. We need a fresh reply
7115 * buffer registering the differences between the BGSAVE and the current
7116 * dataset, so that we can copy to other slaves if needed. */
7117 if (listLength(c->reply) != 0) {
7118 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7119 return;
7120 }
7121
7122 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7123 /* Here we need to check if there is a background saving operation
7124 * in progress, or if it is required to start one */
7125 if (server.bgsavechildpid != -1) {
7126 /* Ok a background save is in progress. Let's check if it is a good
7127 * one for replication, i.e. if there is another slave that is
7128 * registering differences since the server forked to save */
7129 redisClient *slave;
7130 listNode *ln;
7131 listIter li;
7132
7133 listRewind(server.slaves,&li);
7134 while((ln = listNext(&li))) {
7135 slave = ln->value;
7136 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7137 }
7138 if (ln) {
7139 /* Perfect, the server is already registering differences for
7140 * another slave. Set the right state, and copy the buffer. */
7141 listRelease(c->reply);
7142 c->reply = listDup(slave->reply);
7143 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7144 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7145 } else {
7146 /* No way, we need to wait for the next BGSAVE in order to
7147 * register differences */
7148 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7149 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7150 }
7151 } else {
7152 /* Ok we don't have a BGSAVE in progress, let's start one */
7153 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7154 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7155 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7156 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7157 return;
7158 }
7159 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7160 }
7161 c->repldbfd = -1;
7162 c->flags |= REDIS_SLAVE;
7163 c->slaveseldb = 0;
7164 listAddNodeTail(server.slaves,c);
7165 return;
7166 }
7167
7168 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7169 redisClient *slave = privdata;
7170 REDIS_NOTUSED(el);
7171 REDIS_NOTUSED(mask);
7172 char buf[REDIS_IOBUF_LEN];
7173 ssize_t nwritten, buflen;
7174
7175 if (slave->repldboff == 0) {
7176 /* Write the bulk write count before to transfer the DB. In theory here
7177 * we don't know how much room there is in the output buffer of the
7178 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7179 * operations) will never be smaller than the few bytes we need. */
7180 sds bulkcount;
7181
7182 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7183 slave->repldbsize);
7184 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7185 {
7186 sdsfree(bulkcount);
7187 freeClient(slave);
7188 return;
7189 }
7190 sdsfree(bulkcount);
7191 }
7192 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7193 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7194 if (buflen <= 0) {
7195 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7196 (buflen == 0) ? "premature EOF" : strerror(errno));
7197 freeClient(slave);
7198 return;
7199 }
7200 if ((nwritten = write(fd,buf,buflen)) == -1) {
7201 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7202 strerror(errno));
7203 freeClient(slave);
7204 return;
7205 }
7206 slave->repldboff += nwritten;
7207 if (slave->repldboff == slave->repldbsize) {
7208 close(slave->repldbfd);
7209 slave->repldbfd = -1;
7210 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7211 slave->replstate = REDIS_REPL_ONLINE;
7212 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7213 sendReplyToClient, slave) == AE_ERR) {
7214 freeClient(slave);
7215 return;
7216 }
7217 addReplySds(slave,sdsempty());
7218 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7219 }
7220 }
7221
7222 /* This function is called at the end of every backgrond saving.
7223 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7224 * otherwise REDIS_ERR is passed to the function.
7225 *
7226 * The goal of this function is to handle slaves waiting for a successful
7227 * background saving in order to perform non-blocking synchronization. */
7228 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7229 listNode *ln;
7230 int startbgsave = 0;
7231 listIter li;
7232
7233 listRewind(server.slaves,&li);
7234 while((ln = listNext(&li))) {
7235 redisClient *slave = ln->value;
7236
7237 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7238 startbgsave = 1;
7239 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7240 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7241 struct redis_stat buf;
7242
7243 if (bgsaveerr != REDIS_OK) {
7244 freeClient(slave);
7245 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7246 continue;
7247 }
7248 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7249 redis_fstat(slave->repldbfd,&buf) == -1) {
7250 freeClient(slave);
7251 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7252 continue;
7253 }
7254 slave->repldboff = 0;
7255 slave->repldbsize = buf.st_size;
7256 slave->replstate = REDIS_REPL_SEND_BULK;
7257 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7258 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7259 freeClient(slave);
7260 continue;
7261 }
7262 }
7263 }
7264 if (startbgsave) {
7265 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7266 listIter li;
7267
7268 listRewind(server.slaves,&li);
7269 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7270 while((ln = listNext(&li))) {
7271 redisClient *slave = ln->value;
7272
7273 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7274 freeClient(slave);
7275 }
7276 }
7277 }
7278 }
7279
7280 static int syncWithMaster(void) {
7281 char buf[1024], tmpfile[256], authcmd[1024];
7282 long dumpsize;
7283 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7284 int dfd, maxtries = 5;
7285
7286 if (fd == -1) {
7287 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7288 strerror(errno));
7289 return REDIS_ERR;
7290 }
7291
7292 /* AUTH with the master if required. */
7293 if(server.masterauth) {
7294 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7295 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7296 close(fd);
7297 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7298 strerror(errno));
7299 return REDIS_ERR;
7300 }
7301 /* Read the AUTH result. */
7302 if (syncReadLine(fd,buf,1024,3600) == -1) {
7303 close(fd);
7304 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7305 strerror(errno));
7306 return REDIS_ERR;
7307 }
7308 if (buf[0] != '+') {
7309 close(fd);
7310 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7311 return REDIS_ERR;
7312 }
7313 }
7314
7315 /* Issue the SYNC command */
7316 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7317 close(fd);
7318 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7319 strerror(errno));
7320 return REDIS_ERR;
7321 }
7322 /* Read the bulk write count */
7323 if (syncReadLine(fd,buf,1024,3600) == -1) {
7324 close(fd);
7325 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7326 strerror(errno));
7327 return REDIS_ERR;
7328 }
7329 if (buf[0] != '$') {
7330 close(fd);
7331 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7332 return REDIS_ERR;
7333 }
7334 dumpsize = strtol(buf+1,NULL,10);
7335 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7336 /* Read the bulk write data on a temp file */
7337 while(maxtries--) {
7338 snprintf(tmpfile,256,
7339 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7340 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7341 if (dfd != -1) break;
7342 sleep(1);
7343 }
7344 if (dfd == -1) {
7345 close(fd);
7346 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7347 return REDIS_ERR;
7348 }
7349 while(dumpsize) {
7350 int nread, nwritten;
7351
7352 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7353 if (nread == -1) {
7354 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7355 strerror(errno));
7356 close(fd);
7357 close(dfd);
7358 return REDIS_ERR;
7359 }
7360 nwritten = write(dfd,buf,nread);
7361 if (nwritten == -1) {
7362 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7363 close(fd);
7364 close(dfd);
7365 return REDIS_ERR;
7366 }
7367 dumpsize -= nread;
7368 }
7369 close(dfd);
7370 if (rename(tmpfile,server.dbfilename) == -1) {
7371 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7372 unlink(tmpfile);
7373 close(fd);
7374 return REDIS_ERR;
7375 }
7376 emptyDb();
7377 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7378 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7379 close(fd);
7380 return REDIS_ERR;
7381 }
7382 server.master = createClient(fd);
7383 server.master->flags |= REDIS_MASTER;
7384 server.master->authenticated = 1;
7385 server.replstate = REDIS_REPL_CONNECTED;
7386 return REDIS_OK;
7387 }
7388
7389 static void slaveofCommand(redisClient *c) {
7390 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7391 !strcasecmp(c->argv[2]->ptr,"one")) {
7392 if (server.masterhost) {
7393 sdsfree(server.masterhost);
7394 server.masterhost = NULL;
7395 if (server.master) freeClient(server.master);
7396 server.replstate = REDIS_REPL_NONE;
7397 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7398 }
7399 } else {
7400 sdsfree(server.masterhost);
7401 server.masterhost = sdsdup(c->argv[1]->ptr);
7402 server.masterport = atoi(c->argv[2]->ptr);
7403 if (server.master) freeClient(server.master);
7404 server.replstate = REDIS_REPL_CONNECT;
7405 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7406 server.masterhost, server.masterport);
7407 }
7408 addReply(c,shared.ok);
7409 }
7410
7411 /* ============================ Maxmemory directive ======================== */
7412
7413 /* Try to free one object form the pre-allocated objects free list.
7414 * This is useful under low mem conditions as by default we take 1 million
7415 * free objects allocated. On success REDIS_OK is returned, otherwise
7416 * REDIS_ERR. */
7417 static int tryFreeOneObjectFromFreelist(void) {
7418 robj *o;
7419
7420 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7421 if (listLength(server.objfreelist)) {
7422 listNode *head = listFirst(server.objfreelist);
7423 o = listNodeValue(head);
7424 listDelNode(server.objfreelist,head);
7425 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7426 zfree(o);
7427 return REDIS_OK;
7428 } else {
7429 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7430 return REDIS_ERR;
7431 }
7432 }
7433
7434 /* This function gets called when 'maxmemory' is set on the config file to limit
7435 * the max memory used by the server, and we are out of memory.
7436 * This function will try to, in order:
7437 *
7438 * - Free objects from the free list
7439 * - Try to remove keys with an EXPIRE set
7440 *
7441 * It is not possible to free enough memory to reach used-memory < maxmemory
7442 * the server will start refusing commands that will enlarge even more the
7443 * memory usage.
7444 */
7445 static void freeMemoryIfNeeded(void) {
7446 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7447 int j, k, freed = 0;
7448
7449 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7450 for (j = 0; j < server.dbnum; j++) {
7451 int minttl = -1;
7452 robj *minkey = NULL;
7453 struct dictEntry *de;
7454
7455 if (dictSize(server.db[j].expires)) {
7456 freed = 1;
7457 /* From a sample of three keys drop the one nearest to
7458 * the natural expire */
7459 for (k = 0; k < 3; k++) {
7460 time_t t;
7461
7462 de = dictGetRandomKey(server.db[j].expires);
7463 t = (time_t) dictGetEntryVal(de);
7464 if (minttl == -1 || t < minttl) {
7465 minkey = dictGetEntryKey(de);
7466 minttl = t;
7467 }
7468 }
7469 deleteKey(server.db+j,minkey);
7470 }
7471 }
7472 if (!freed) return; /* nothing to free... */
7473 }
7474 }
7475
7476 /* ============================== Append Only file ========================== */
7477
7478 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7479 sds buf = sdsempty();
7480 int j;
7481 ssize_t nwritten;
7482 time_t now;
7483 robj *tmpargv[3];
7484
7485 /* The DB this command was targetting is not the same as the last command
7486 * we appendend. To issue a SELECT command is needed. */
7487 if (dictid != server.appendseldb) {
7488 char seldb[64];
7489
7490 snprintf(seldb,sizeof(seldb),"%d",dictid);
7491 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7492 (unsigned long)strlen(seldb),seldb);
7493 server.appendseldb = dictid;
7494 }
7495
7496 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7497 * EXPIREs into EXPIREATs calls */
7498 if (cmd->proc == expireCommand) {
7499 long when;
7500
7501 tmpargv[0] = createStringObject("EXPIREAT",8);
7502 tmpargv[1] = argv[1];
7503 incrRefCount(argv[1]);
7504 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7505 tmpargv[2] = createObject(REDIS_STRING,
7506 sdscatprintf(sdsempty(),"%ld",when));
7507 argv = tmpargv;
7508 }
7509
7510 /* Append the actual command */
7511 buf = sdscatprintf(buf,"*%d\r\n",argc);
7512 for (j = 0; j < argc; j++) {
7513 robj *o = argv[j];
7514
7515 o = getDecodedObject(o);
7516 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7517 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7518 buf = sdscatlen(buf,"\r\n",2);
7519 decrRefCount(o);
7520 }
7521
7522 /* Free the objects from the modified argv for EXPIREAT */
7523 if (cmd->proc == expireCommand) {
7524 for (j = 0; j < 3; j++)
7525 decrRefCount(argv[j]);
7526 }
7527
7528 /* We want to perform a single write. This should be guaranteed atomic
7529 * at least if the filesystem we are writing is a real physical one.
7530 * While this will save us against the server being killed I don't think
7531 * there is much to do about the whole server stopping for power problems
7532 * or alike */
7533 nwritten = write(server.appendfd,buf,sdslen(buf));
7534 if (nwritten != (signed)sdslen(buf)) {
7535 /* Ooops, we are in troubles. The best thing to do for now is
7536 * to simply exit instead to give the illusion that everything is
7537 * working as expected. */
7538 if (nwritten == -1) {
7539 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7540 } else {
7541 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7542 }
7543 exit(1);
7544 }
7545 /* If a background append only file rewriting is in progress we want to
7546 * accumulate the differences between the child DB and the current one
7547 * in a buffer, so that when the child process will do its work we
7548 * can append the differences to the new append only file. */
7549 if (server.bgrewritechildpid != -1)
7550 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7551
7552 sdsfree(buf);
7553 now = time(NULL);
7554 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7555 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7556 now-server.lastfsync > 1))
7557 {
7558 fsync(server.appendfd); /* Let's try to get this data on the disk */
7559 server.lastfsync = now;
7560 }
7561 }
7562
7563 /* In Redis commands are always executed in the context of a client, so in
7564 * order to load the append only file we need to create a fake client. */
7565 static struct redisClient *createFakeClient(void) {
7566 struct redisClient *c = zmalloc(sizeof(*c));
7567
7568 selectDb(c,0);
7569 c->fd = -1;
7570 c->querybuf = sdsempty();
7571 c->argc = 0;
7572 c->argv = NULL;
7573 c->flags = 0;
7574 /* We set the fake client as a slave waiting for the synchronization
7575 * so that Redis will not try to send replies to this client. */
7576 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7577 c->reply = listCreate();
7578 listSetFreeMethod(c->reply,decrRefCount);
7579 listSetDupMethod(c->reply,dupClientReplyValue);
7580 return c;
7581 }
7582
7583 static void freeFakeClient(struct redisClient *c) {
7584 sdsfree(c->querybuf);
7585 listRelease(c->reply);
7586 zfree(c);
7587 }
7588
7589 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7590 * error (the append only file is zero-length) REDIS_ERR is returned. On
7591 * fatal error an error message is logged and the program exists. */
7592 int loadAppendOnlyFile(char *filename) {
7593 struct redisClient *fakeClient;
7594 FILE *fp = fopen(filename,"r");
7595 struct redis_stat sb;
7596 unsigned long long loadedkeys = 0;
7597
7598 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7599 return REDIS_ERR;
7600
7601 if (fp == NULL) {
7602 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7603 exit(1);
7604 }
7605
7606 fakeClient = createFakeClient();
7607 while(1) {
7608 int argc, j;
7609 unsigned long len;
7610 robj **argv;
7611 char buf[128];
7612 sds argsds;
7613 struct redisCommand *cmd;
7614
7615 if (fgets(buf,sizeof(buf),fp) == NULL) {
7616 if (feof(fp))
7617 break;
7618 else
7619 goto readerr;
7620 }
7621 if (buf[0] != '*') goto fmterr;
7622 argc = atoi(buf+1);
7623 argv = zmalloc(sizeof(robj*)*argc);
7624 for (j = 0; j < argc; j++) {
7625 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7626 if (buf[0] != '$') goto fmterr;
7627 len = strtol(buf+1,NULL,10);
7628 argsds = sdsnewlen(NULL,len);
7629 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7630 argv[j] = createObject(REDIS_STRING,argsds);
7631 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7632 }
7633
7634 /* Command lookup */
7635 cmd = lookupCommand(argv[0]->ptr);
7636 if (!cmd) {
7637 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7638 exit(1);
7639 }
7640 /* Try object sharing and encoding */
7641 if (server.shareobjects) {
7642 int j;
7643 for(j = 1; j < argc; j++)
7644 argv[j] = tryObjectSharing(argv[j]);
7645 }
7646 if (cmd->flags & REDIS_CMD_BULK)
7647 tryObjectEncoding(argv[argc-1]);
7648 /* Run the command in the context of a fake client */
7649 fakeClient->argc = argc;
7650 fakeClient->argv = argv;
7651 cmd->proc(fakeClient);
7652 /* Discard the reply objects list from the fake client */
7653 while(listLength(fakeClient->reply))
7654 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7655 /* Clean up, ready for the next command */
7656 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7657 zfree(argv);
7658 /* Handle swapping while loading big datasets when VM is on */
7659 loadedkeys++;
7660 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7661 while (zmalloc_used_memory() > server.vm_max_memory) {
7662 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7663 }
7664 }
7665 }
7666 fclose(fp);
7667 freeFakeClient(fakeClient);
7668 return REDIS_OK;
7669
7670 readerr:
7671 if (feof(fp)) {
7672 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7673 } else {
7674 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7675 }
7676 exit(1);
7677 fmterr:
7678 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7679 exit(1);
7680 }
7681
7682 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7683 static int fwriteBulkObject(FILE *fp, robj *obj) {
7684 char buf[128];
7685 int decrrc = 0;
7686
7687 /* Avoid the incr/decr ref count business if possible to help
7688 * copy-on-write (we are often in a child process when this function
7689 * is called).
7690 * Also makes sure that key objects don't get incrRefCount-ed when VM
7691 * is enabled */
7692 if (obj->encoding != REDIS_ENCODING_RAW) {
7693 obj = getDecodedObject(obj);
7694 decrrc = 1;
7695 }
7696 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7697 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7698 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7699 goto err;
7700 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7701 if (decrrc) decrRefCount(obj);
7702 return 1;
7703 err:
7704 if (decrrc) decrRefCount(obj);
7705 return 0;
7706 }
7707
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload. The payload can contain any byte,
 * and is not accessed at all when 'len' is zero.
 *
 * Returns 1 on success, 0 if one of the fwrite() calls failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: it must be formatted with %lu, as %ld is a format
     * mismatch that would print very big lengths as negative numbers. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns zero for zero length items, so skip empty payloads */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7719
/* Emit the double 'd' on 'fp' in bulk format: $<count>\r\n<payload>\r\n
 * The payload is the number formatted with "%.17g".
 *
 * Returns 1 on success, 0 if one of the fwrite() calls failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count: this is why we subtract two. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
7730
/* Emit the long 'l' on 'fp' in bulk format: $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of the number.
 *
 * Returns 1 on success, 0 if one of the fwrite() calls failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count: this is why we subtract two. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
7741
7742 /* Write a sequence of commands able to fully rebuild the dataset into
7743 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7744 static int rewriteAppendOnlyFile(char *filename) {
7745 dictIterator *di = NULL;
7746 dictEntry *de;
7747 FILE *fp;
7748 char tmpfile[256];
7749 int j;
7750 time_t now = time(NULL);
7751
7752 /* Note that we have to use a different temp name here compared to the
7753 * one used by rewriteAppendOnlyFileBackground() function. */
7754 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7755 fp = fopen(tmpfile,"w");
7756 if (!fp) {
7757 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7758 return REDIS_ERR;
7759 }
7760 for (j = 0; j < server.dbnum; j++) {
7761 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7762 redisDb *db = server.db+j;
7763 dict *d = db->dict;
7764 if (dictSize(d) == 0) continue;
7765 di = dictGetIterator(d);
7766 if (!di) {
7767 fclose(fp);
7768 return REDIS_ERR;
7769 }
7770
7771 /* SELECT the new DB */
7772 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7773 if (fwriteBulkLong(fp,j) == 0) goto werr;
7774
7775 /* Iterate this DB writing every entry */
7776 while((de = dictNext(di)) != NULL) {
7777 robj *key, *o;
7778 time_t expiretime;
7779 int swapped;
7780
7781 key = dictGetEntryKey(de);
7782 /* If the value for this key is swapped, load a preview in memory.
7783 * We use a "swapped" flag to remember if we need to free the
7784 * value object instead to just increment the ref count anyway
7785 * in order to avoid copy-on-write of pages if we are forked() */
7786 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7787 key->storage == REDIS_VM_SWAPPING) {
7788 o = dictGetEntryVal(de);
7789 swapped = 0;
7790 } else {
7791 o = vmPreviewObject(key);
7792 swapped = 1;
7793 }
7794 expiretime = getExpire(db,key);
7795
7796 /* Save the key and associated value */
7797 if (o->type == REDIS_STRING) {
7798 /* Emit a SET command */
7799 char cmd[]="*3\r\n$3\r\nSET\r\n";
7800 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7801 /* Key and value */
7802 if (fwriteBulkObject(fp,key) == 0) goto werr;
7803 if (fwriteBulkObject(fp,o) == 0) goto werr;
7804 } else if (o->type == REDIS_LIST) {
7805 /* Emit the RPUSHes needed to rebuild the list */
7806 list *list = o->ptr;
7807 listNode *ln;
7808 listIter li;
7809
7810 listRewind(list,&li);
7811 while((ln = listNext(&li))) {
7812 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7813 robj *eleobj = listNodeValue(ln);
7814
7815 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7816 if (fwriteBulkObject(fp,key) == 0) goto werr;
7817 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7818 }
7819 } else if (o->type == REDIS_SET) {
7820 /* Emit the SADDs needed to rebuild the set */
7821 dict *set = o->ptr;
7822 dictIterator *di = dictGetIterator(set);
7823 dictEntry *de;
7824
7825 while((de = dictNext(di)) != NULL) {
7826 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7827 robj *eleobj = dictGetEntryKey(de);
7828
7829 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7830 if (fwriteBulkObject(fp,key) == 0) goto werr;
7831 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7832 }
7833 dictReleaseIterator(di);
7834 } else if (o->type == REDIS_ZSET) {
7835 /* Emit the ZADDs needed to rebuild the sorted set */
7836 zset *zs = o->ptr;
7837 dictIterator *di = dictGetIterator(zs->dict);
7838 dictEntry *de;
7839
7840 while((de = dictNext(di)) != NULL) {
7841 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7842 robj *eleobj = dictGetEntryKey(de);
7843 double *score = dictGetEntryVal(de);
7844
7845 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7846 if (fwriteBulkObject(fp,key) == 0) goto werr;
7847 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7848 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7849 }
7850 dictReleaseIterator(di);
7851 } else if (o->type == REDIS_HASH) {
7852 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7853
7854 /* Emit the HSETs needed to rebuild the hash */
7855 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7856 unsigned char *p = zipmapRewind(o->ptr);
7857 unsigned char *field, *val;
7858 unsigned int flen, vlen;
7859
7860 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7861 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7862 if (fwriteBulkObject(fp,key) == 0) goto werr;
7863 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7864 return -1;
7865 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7866 return -1;
7867 }
7868 } else {
7869 dictIterator *di = dictGetIterator(o->ptr);
7870 dictEntry *de;
7871
7872 while((de = dictNext(di)) != NULL) {
7873 robj *field = dictGetEntryKey(de);
7874 robj *val = dictGetEntryVal(de);
7875
7876 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7877 if (fwriteBulkObject(fp,key) == 0) goto werr;
7878 if (fwriteBulkObject(fp,field) == -1) return -1;
7879 if (fwriteBulkObject(fp,val) == -1) return -1;
7880 }
7881 dictReleaseIterator(di);
7882 }
7883 } else {
7884 redisAssert(0);
7885 }
7886 /* Save the expire time */
7887 if (expiretime != -1) {
7888 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7889 /* If this key is already expired skip it */
7890 if (expiretime < now) continue;
7891 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7892 if (fwriteBulkObject(fp,key) == 0) goto werr;
7893 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7894 }
7895 if (swapped) decrRefCount(o);
7896 }
7897 dictReleaseIterator(di);
7898 }
7899
7900 /* Make sure data will not remain on the OS's output buffers */
7901 fflush(fp);
7902 fsync(fileno(fp));
7903 fclose(fp);
7904
7905 /* Use RENAME to make sure the DB file is changed atomically only
7906 * if the generate DB file is ok. */
7907 if (rename(tmpfile,filename) == -1) {
7908 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7909 unlink(tmpfile);
7910 return REDIS_ERR;
7911 }
7912 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7913 return REDIS_OK;
7914
7915 werr:
7916 fclose(fp);
7917 unlink(tmpfile);
7918 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7919 if (di) dictReleaseIterator(di);
7920 return REDIS_ERR;
7921 }
7922
7923 /* This is how rewriting of the append only file in background works:
7924 *
7925 * 1) The user calls BGREWRITEAOF
7926 * 2) Redis calls this function, that forks():
7927 * 2a) the child rewrite the append only file in a temp file.
7928 * 2b) the parent accumulates differences in server.bgrewritebuf.
7929 * 3) When the child finished '2a' exists.
7930 * 4) The parent will trap the exit code, if it's OK, will append the
7931 * data accumulated into server.bgrewritebuf into the temp file, and
7932 * finally will rename(2) the temp file in the actual file name.
7933 * The the new file is reopened as the new append only file. Profit!
7934 */
7935 static int rewriteAppendOnlyFileBackground(void) {
7936 pid_t childpid;
7937
7938 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7939 if (server.vm_enabled) waitEmptyIOJobsQueue();
7940 if ((childpid = fork()) == 0) {
7941 /* Child */
7942 char tmpfile[256];
7943
7944 if (server.vm_enabled) vmReopenSwapFile();
7945 close(server.fd);
7946 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7947 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7948 _exit(0);
7949 } else {
7950 _exit(1);
7951 }
7952 } else {
7953 /* Parent */
7954 if (childpid == -1) {
7955 redisLog(REDIS_WARNING,
7956 "Can't rewrite append only file in background: fork: %s",
7957 strerror(errno));
7958 return REDIS_ERR;
7959 }
7960 redisLog(REDIS_NOTICE,
7961 "Background append only file rewriting started by pid %d",childpid);
7962 server.bgrewritechildpid = childpid;
7963 /* We set appendseldb to -1 in order to force the next call to the
7964 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7965 * accumulated by the parent into server.bgrewritebuf will start
7966 * with a SELECT statement and it will be safe to merge. */
7967 server.appendseldb = -1;
7968 return REDIS_OK;
7969 }
7970 return REDIS_OK; /* unreached */
7971 }
7972
7973 static void bgrewriteaofCommand(redisClient *c) {
7974 if (server.bgrewritechildpid != -1) {
7975 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7976 return;
7977 }
7978 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7979 char *status = "+Background append only file rewriting started\r\n";
7980 addReplySds(c,sdsnew(status));
7981 } else {
7982 addReply(c,shared.err);
7983 }
7984 }
7985
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * working directory. The return value of unlink(2) is not checked, so
 * nothing is reported if the file does not exist. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
7992
7993 /* Virtual Memory is composed mainly of two subsystems:
7994 * - Blocking Virtual Memory
7995 * - Threaded Virtual Memory I/O
7996 * The two parts are not fully decoupled, but functions are split among two
7997 * different sections of the source code (delimited by comments) in order to
7998 * make more clear what functionality is about the blocking VM and what about
7999 * the threaded (not blocking) VM.
8000 *
8001 * Redis VM design:
8002 *
8003 * Redis VM is a blocking VM (one that blocks reading swapped values from
8004 * disk into memory when a value swapped out is needed in memory) that is made
8005 * unblocking by trying to examine the command argument vector in order to
8006 * load in background values that will likely be needed in order to exec
8007 * the command. The command is executed only once all the relevant keys
8008 * are loaded into memory.
8009 *
8010 * This is basically almost as simple as a blocking VM, but almost as parallel
8011 * as a fully non-blocking VM.
8012 */
8013
8014 /* =================== Virtual Memory - Blocking Side ====================== */
8015
8016 /* substitute the first occurrence of '%p' with the process pid in the
8017 * swap file name. */
8018 static void expandVmSwapFilename(void) {
8019 char *p = strstr(server.vm_swap_file,"%p");
8020 sds new;
8021
8022 if (!p) return;
8023 new = sdsempty();
8024 *p = '\0';
8025 new = sdscat(new,server.vm_swap_file);
8026 new = sdscatprintf(new,"%ld",(long) getpid());
8027 new = sdscat(new,p+2);
8028 zfree(server.vm_swap_file);
8029 server.vm_swap_file = new;
8030 }
8031
8032 static void vmInit(void) {
8033 off_t totsize;
8034 int pipefds[2];
8035 size_t stacksize;
8036
8037 if (server.vm_max_threads != 0)
8038 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8039
8040 expandVmSwapFilename();
8041 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8042 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8043 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8044 }
8045 if (server.vm_fp == NULL) {
8046 redisLog(REDIS_WARNING,
8047 "Impossible to open the swap file: %s. Exiting.",
8048 strerror(errno));
8049 exit(1);
8050 }
8051 server.vm_fd = fileno(server.vm_fp);
8052 server.vm_next_page = 0;
8053 server.vm_near_pages = 0;
8054 server.vm_stats_used_pages = 0;
8055 server.vm_stats_swapped_objects = 0;
8056 server.vm_stats_swapouts = 0;
8057 server.vm_stats_swapins = 0;
8058 totsize = server.vm_pages*server.vm_page_size;
8059 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8060 if (ftruncate(server.vm_fd,totsize) == -1) {
8061 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8062 strerror(errno));
8063 exit(1);
8064 } else {
8065 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8066 }
8067 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8068 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8069 (long long) (server.vm_pages+7)/8, server.vm_pages);
8070 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8071
8072 /* Initialize threaded I/O (used by Virtual Memory) */
8073 server.io_newjobs = listCreate();
8074 server.io_processing = listCreate();
8075 server.io_processed = listCreate();
8076 server.io_ready_clients = listCreate();
8077 pthread_mutex_init(&server.io_mutex,NULL);
8078 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8079 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8080 server.io_active_threads = 0;
8081 if (pipe(pipefds) == -1) {
8082 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8083 ,strerror(errno));
8084 exit(1);
8085 }
8086 server.io_ready_pipe_read = pipefds[0];
8087 server.io_ready_pipe_write = pipefds[1];
8088 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8089 /* LZF requires a lot of stack */
8090 pthread_attr_init(&server.io_threads_attr);
8091 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8092 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8093 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8094 /* Listen for events in the threaded I/O pipe */
8095 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8096 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8097 oom("creating file event");
8098 }
8099
8100 /* Mark the page as used */
8101 static void vmMarkPageUsed(off_t page) {
8102 off_t byte = page/8;
8103 int bit = page&7;
8104 redisAssert(vmFreePage(page) == 1);
8105 server.vm_bitmap[byte] |= 1<<bit;
8106 }
8107
8108 /* Mark N contiguous pages as used, with 'page' being the first. */
8109 static void vmMarkPagesUsed(off_t page, off_t count) {
8110 off_t j;
8111
8112 for (j = 0; j < count; j++)
8113 vmMarkPageUsed(page+j);
8114 server.vm_stats_used_pages += count;
8115 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8116 (long long)count, (long long)page);
8117 }
8118
8119 /* Mark the page as free */
8120 static void vmMarkPageFree(off_t page) {
8121 off_t byte = page/8;
8122 int bit = page&7;
8123 redisAssert(vmFreePage(page) == 0);
8124 server.vm_bitmap[byte] &= ~(1<<bit);
8125 }
8126
8127 /* Mark N contiguous pages as free, with 'page' being the first. */
8128 static void vmMarkPagesFree(off_t page, off_t count) {
8129 off_t j;
8130
8131 for (j = 0; j < count; j++)
8132 vmMarkPageFree(page+j);
8133 server.vm_stats_used_pages -= count;
8134 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8135 (long long)count, (long long)page);
8136 }
8137
8138 /* Test if the page is free */
8139 static int vmFreePage(off_t page) {
8140 off_t byte = page/8;
8141 int bit = page&7;
8142 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8143 }
8144
8145 /* Find N contiguous free pages storing the first page of the cluster in *first.
8146 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8147 * REDIS_ERR is returned.
8148 *
8149 * This function uses a simple algorithm: we try to allocate
8150 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8151 * again from the start of the swap file searching for free spaces.
8152 *
8153 * If it looks pretty clear that there are no free pages near our offset
8154 * we try to find less populated places doing a forward jump of
8155 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8156 * without hurry, and then we jump again and so forth...
8157 *
8158 * This function can be improved using a free list to avoid to guess
8159 * too much, since we could collect data about freed pages.
8160 *
8161 * note: I implemented this function just after watching an episode of
8162 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8163 */
8164 static int vmFindContiguousPages(off_t *first, off_t n) {
8165 off_t base, offset = 0, since_jump = 0, numfree = 0;
8166
8167 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8168 server.vm_near_pages = 0;
8169 server.vm_next_page = 0;
8170 }
8171 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8172 base = server.vm_next_page;
8173
8174 while(offset < server.vm_pages) {
8175 off_t this = base+offset;
8176
8177 /* If we overflow, restart from page zero */
8178 if (this >= server.vm_pages) {
8179 this -= server.vm_pages;
8180 if (this == 0) {
8181 /* Just overflowed, what we found on tail is no longer
8182 * interesting, as it's no longer contiguous. */
8183 numfree = 0;
8184 }
8185 }
8186 if (vmFreePage(this)) {
8187 /* This is a free page */
8188 numfree++;
8189 /* Already got N free pages? Return to the caller, with success */
8190 if (numfree == n) {
8191 *first = this-(n-1);
8192 server.vm_next_page = this+1;
8193 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8194 return REDIS_OK;
8195 }
8196 } else {
8197 /* The current one is not a free page */
8198 numfree = 0;
8199 }
8200
8201 /* Fast-forward if the current page is not free and we already
8202 * searched enough near this place. */
8203 since_jump++;
8204 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8205 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8206 since_jump = 0;
8207 /* Note that even if we rewind after the jump, we are don't need
8208 * to make sure numfree is set to zero as we only jump *if* it
8209 * is set to zero. */
8210 } else {
8211 /* Otherwise just check the next page */
8212 offset++;
8213 }
8214 }
8215 return REDIS_ERR;
8216 }
8217
8218 /* Write the specified object at the specified page of the swap file */
8219 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8220 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8221 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8222 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8223 redisLog(REDIS_WARNING,
8224 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8225 strerror(errno));
8226 return REDIS_ERR;
8227 }
8228 rdbSaveObject(server.vm_fp,o);
8229 fflush(server.vm_fp);
8230 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8231 return REDIS_OK;
8232 }
8233
8234 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8235 * needed to later retrieve the object into the key object.
8236 * If we can't find enough contiguous empty pages to swap the object on disk
8237 * REDIS_ERR is returned. */
8238 static int vmSwapObjectBlocking(robj *key, robj *val) {
8239 off_t pages = rdbSavedObjectPages(val,NULL);
8240 off_t page;
8241
8242 assert(key->storage == REDIS_VM_MEMORY);
8243 assert(key->refcount == 1);
8244 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8245 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8246 key->vm.page = page;
8247 key->vm.usedpages = pages;
8248 key->storage = REDIS_VM_SWAPPED;
8249 key->vtype = val->type;
8250 decrRefCount(val); /* Deallocate the object from memory. */
8251 vmMarkPagesUsed(page,pages);
8252 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8253 (unsigned char*) key->ptr,
8254 (unsigned long long) page, (unsigned long long) pages);
8255 server.vm_stats_swapped_objects++;
8256 server.vm_stats_swapouts++;
8257 return REDIS_OK;
8258 }
8259
8260 static robj *vmReadObjectFromSwap(off_t page, int type) {
8261 robj *o;
8262
8263 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8264 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8265 redisLog(REDIS_WARNING,
8266 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8267 strerror(errno));
8268 _exit(1);
8269 }
8270 o = rdbLoadObject(type,server.vm_fp);
8271 if (o == NULL) {
8272 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8273 _exit(1);
8274 }
8275 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8276 return o;
8277 }
8278
8279 /* Load the value object relative to the 'key' object from swap to memory.
8280 * The newly allocated object is returned.
8281 *
8282 * If preview is true the unserialized object is returned to the caller but
8283 * no changes are made to the key object, nor the pages are marked as freed */
8284 static robj *vmGenericLoadObject(robj *key, int preview) {
8285 robj *val;
8286
8287 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8288 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8289 if (!preview) {
8290 key->storage = REDIS_VM_MEMORY;
8291 key->vm.atime = server.unixtime;
8292 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8293 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8294 (unsigned char*) key->ptr);
8295 server.vm_stats_swapped_objects--;
8296 } else {
8297 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8298 (unsigned char*) key->ptr);
8299 }
8300 server.vm_stats_swapins++;
8301 return val;
8302 }
8303
8304 /* Plain object loading, from swap to memory */
8305 static robj *vmLoadObject(robj *key) {
8306 /* If we are loading the object in background, stop it, we
8307 * need to load this object synchronously ASAP. */
8308 if (key->storage == REDIS_VM_LOADING)
8309 vmCancelThreadedIOJob(key);
8310 return vmGenericLoadObject(key,0);
8311 }
8312
8313 /* Just load the value on disk, without to modify the key.
8314 * This is useful when we want to perform some operation on the value
8315 * without to really bring it from swap to memory, like while saving the
8316 * dataset or rewriting the append only log. */
8317 static robj *vmPreviewObject(robj *key) {
8318 return vmGenericLoadObject(key,1);
8319 }
8320
8321 /* How a good candidate is this object for swapping?
8322 * The better candidate it is, the greater the returned value.
8323 *
8324 * Currently we try to perform a fast estimation of the object size in
8325 * memory, and combine it with aging informations.
8326 *
8327 * Basically swappability = idle-time * log(estimated size)
8328 *
8329 * Bigger objects are preferred over smaller objects, but not
8330 * proportionally, this is why we use the logarithm. This algorithm is
8331 * just a first try and will probably be tuned later. */
8332 static double computeObjectSwappability(robj *o) {
8333 time_t age = server.unixtime - o->vm.atime;
8334 long asize = 0;
8335 list *l;
8336 dict *d;
8337 struct dictEntry *de;
8338 int z;
8339
8340 if (age <= 0) return 0;
8341 switch(o->type) {
8342 case REDIS_STRING:
8343 if (o->encoding != REDIS_ENCODING_RAW) {
8344 asize = sizeof(*o);
8345 } else {
8346 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8347 }
8348 break;
8349 case REDIS_LIST:
8350 l = o->ptr;
8351 listNode *ln = listFirst(l);
8352
8353 asize = sizeof(list);
8354 if (ln) {
8355 robj *ele = ln->value;
8356 long elesize;
8357
8358 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8359 (sizeof(*o)+sdslen(ele->ptr)) :
8360 sizeof(*o);
8361 asize += (sizeof(listNode)+elesize)*listLength(l);
8362 }
8363 break;
8364 case REDIS_SET:
8365 case REDIS_ZSET:
8366 z = (o->type == REDIS_ZSET);
8367 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8368
8369 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8370 if (z) asize += sizeof(zset)-sizeof(dict);
8371 if (dictSize(d)) {
8372 long elesize;
8373 robj *ele;
8374
8375 de = dictGetRandomKey(d);
8376 ele = dictGetEntryKey(de);
8377 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8378 (sizeof(*o)+sdslen(ele->ptr)) :
8379 sizeof(*o);
8380 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8381 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8382 }
8383 break;
8384 case REDIS_HASH:
8385 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8386 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8387 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8388 unsigned int klen, vlen;
8389 unsigned char *key, *val;
8390
8391 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8392 klen = 0;
8393 vlen = 0;
8394 }
8395 asize = len*(klen+vlen+3);
8396 } else if (o->encoding == REDIS_ENCODING_HT) {
8397 d = o->ptr;
8398 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8399 if (dictSize(d)) {
8400 long elesize;
8401 robj *ele;
8402
8403 de = dictGetRandomKey(d);
8404 ele = dictGetEntryKey(de);
8405 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8406 (sizeof(*o)+sdslen(ele->ptr)) :
8407 sizeof(*o);
8408 ele = dictGetEntryVal(de);
8409 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8410 (sizeof(*o)+sdslen(ele->ptr)) :
8411 sizeof(*o);
8412 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8413 }
8414 }
8415 break;
8416 }
8417 return (double)age*log(1+asize);
8418 }
8419
8420 /* Try to swap an object that's a good candidate for swapping.
8421 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8422 * to swap any object at all.
8423 *
8424 * If 'usethreaded' is true, Redis will try to swap the object in background
8425 * using I/O threads. */
8426 static int vmSwapOneObject(int usethreads) {
8427 int j, i;
8428 struct dictEntry *best = NULL;
8429 double best_swappability = 0;
8430 redisDb *best_db = NULL;
8431 robj *key, *val;
8432
8433 for (j = 0; j < server.dbnum; j++) {
8434 redisDb *db = server.db+j;
8435 /* Why maxtries is set to 100?
8436 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8437 * are swappable objects */
8438 int maxtries = 100;
8439
8440 if (dictSize(db->dict) == 0) continue;
8441 for (i = 0; i < 5; i++) {
8442 dictEntry *de;
8443 double swappability;
8444
8445 if (maxtries) maxtries--;
8446 de = dictGetRandomKey(db->dict);
8447 key = dictGetEntryKey(de);
8448 val = dictGetEntryVal(de);
8449 /* Only swap objects that are currently in memory.
8450 *
8451 * Also don't swap shared objects if threaded VM is on, as we
8452 * try to ensure that the main thread does not touch the
8453 * object while the I/O thread is using it, but we can't
8454 * control other keys without adding additional mutex. */
8455 if (key->storage != REDIS_VM_MEMORY ||
8456 (server.vm_max_threads != 0 && val->refcount != 1)) {
8457 if (maxtries) i--; /* don't count this try */
8458 continue;
8459 }
8460 swappability = computeObjectSwappability(val);
8461 if (!best || swappability > best_swappability) {
8462 best = de;
8463 best_swappability = swappability;
8464 best_db = db;
8465 }
8466 }
8467 }
8468 if (best == NULL) return REDIS_ERR;
8469 key = dictGetEntryKey(best);
8470 val = dictGetEntryVal(best);
8471
8472 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8473 key->ptr, best_swappability);
8474
8475 /* Unshare the key if needed */
8476 if (key->refcount > 1) {
8477 robj *newkey = dupStringObject(key);
8478 decrRefCount(key);
8479 key = dictGetEntryKey(best) = newkey;
8480 }
8481 /* Swap it */
8482 if (usethreads) {
8483 vmSwapObjectThreaded(key,val,best_db);
8484 return REDIS_OK;
8485 } else {
8486 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8487 dictGetEntryVal(best) = NULL;
8488 return REDIS_OK;
8489 } else {
8490 return REDIS_ERR;
8491 }
8492 }
8493 }
8494
/* Swap out one object using vmSwapOneObject() without I/O threads.
 * The return value of vmSwapOneObject() is returned as it is. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8498
/* Swap out one object using vmSwapOneObject() with I/O threads.
 * The return value of vmSwapOneObject() is returned as it is. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8502
8503 /* Return true if it's safe to swap out objects in a given moment.
8504 * Basically we don't want to swap objects out while there is a BGSAVE
8505 * or a BGAEOREWRITE running in backgroud. */
8506 static int vmCanSwapOut(void) {
8507 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8508 }
8509
8510 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8511 * and was deleted. Otherwise 0 is returned. */
8512 static int deleteIfSwapped(redisDb *db, robj *key) {
8513 dictEntry *de;
8514 robj *foundkey;
8515
8516 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8517 foundkey = dictGetEntryKey(de);
8518 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8519 deleteKey(db,key);
8520 return 1;
8521 }
8522
8523 /* =================== Virtual Memory - Threaded I/O ======================= */
8524
8525 static void freeIOJob(iojob *j) {
8526 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8527 j->type == REDIS_IOJOB_DO_SWAP ||
8528 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8529 decrRefCount(j->val);
8530 decrRefCount(j->key);
8531 zfree(j);
8532 }
8533
8534 /* Every time a thread finished a Job, it writes a byte into the write side
8535 * of an unix pipe in order to "awake" the main thread, and this function
8536 * is called. */
8537 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8538 int mask)
8539 {
8540 char buf[1];
8541 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8542 REDIS_NOTUSED(el);
8543 REDIS_NOTUSED(mask);
8544 REDIS_NOTUSED(privdata);
8545
8546 /* For every byte we read in the read side of the pipe, there is one
8547 * I/O job completed to process. */
8548 while((retval = read(fd,buf,1)) == 1) {
8549 iojob *j;
8550 listNode *ln;
8551 robj *key;
8552 struct dictEntry *de;
8553
8554 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8555
8556 /* Get the processed element (the oldest one) */
8557 lockThreadedIO();
8558 assert(listLength(server.io_processed) != 0);
8559 if (toprocess == -1) {
8560 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8561 if (toprocess <= 0) toprocess = 1;
8562 }
8563 ln = listFirst(server.io_processed);
8564 j = ln->value;
8565 listDelNode(server.io_processed,ln);
8566 unlockThreadedIO();
8567 /* If this job is marked as canceled, just ignore it */
8568 if (j->canceled) {
8569 freeIOJob(j);
8570 continue;
8571 }
8572 /* Post process it in the main thread, as there are things we
8573 * can do just here to avoid race conditions and/or invasive locks */
8574 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8575 de = dictFind(j->db->dict,j->key);
8576 assert(de != NULL);
8577 key = dictGetEntryKey(de);
8578 if (j->type == REDIS_IOJOB_LOAD) {
8579 redisDb *db;
8580
8581 /* Key loaded, bring it at home */
8582 key->storage = REDIS_VM_MEMORY;
8583 key->vm.atime = server.unixtime;
8584 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8585 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8586 (unsigned char*) key->ptr);
8587 server.vm_stats_swapped_objects--;
8588 server.vm_stats_swapins++;
8589 dictGetEntryVal(de) = j->val;
8590 incrRefCount(j->val);
8591 db = j->db;
8592 freeIOJob(j);
8593 /* Handle clients waiting for this key to be loaded. */
8594 handleClientsBlockedOnSwappedKey(db,key);
8595 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8596 /* Now we know the amount of pages required to swap this object.
8597 * Let's find some space for it, and queue this task again
8598 * rebranded as REDIS_IOJOB_DO_SWAP. */
8599 if (!vmCanSwapOut() ||
8600 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8601 {
8602 /* Ooops... no space or we can't swap as there is
8603 * a fork()ed Redis trying to save stuff on disk. */
8604 freeIOJob(j);
8605 key->storage = REDIS_VM_MEMORY; /* undo operation */
8606 } else {
8607 /* Note that we need to mark this pages as used now,
8608 * if the job will be canceled, we'll mark them as freed
8609 * again. */
8610 vmMarkPagesUsed(j->page,j->pages);
8611 j->type = REDIS_IOJOB_DO_SWAP;
8612 lockThreadedIO();
8613 queueIOJob(j);
8614 unlockThreadedIO();
8615 }
8616 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8617 robj *val;
8618
8619 /* Key swapped. We can finally free some memory. */
8620 if (key->storage != REDIS_VM_SWAPPING) {
8621 printf("key->storage: %d\n",key->storage);
8622 printf("key->name: %s\n",(char*)key->ptr);
8623 printf("key->refcount: %d\n",key->refcount);
8624 printf("val: %p\n",(void*)j->val);
8625 printf("val->type: %d\n",j->val->type);
8626 printf("val->ptr: %s\n",(char*)j->val->ptr);
8627 }
8628 redisAssert(key->storage == REDIS_VM_SWAPPING);
8629 val = dictGetEntryVal(de);
8630 key->vm.page = j->page;
8631 key->vm.usedpages = j->pages;
8632 key->storage = REDIS_VM_SWAPPED;
8633 key->vtype = j->val->type;
8634 decrRefCount(val); /* Deallocate the object from memory. */
8635 dictGetEntryVal(de) = NULL;
8636 redisLog(REDIS_DEBUG,
8637 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8638 (unsigned char*) key->ptr,
8639 (unsigned long long) j->page, (unsigned long long) j->pages);
8640 server.vm_stats_swapped_objects++;
8641 server.vm_stats_swapouts++;
8642 freeIOJob(j);
8643 /* Put a few more swap requests in queue if we are still
8644 * out of memory */
8645 if (trytoswap && vmCanSwapOut() &&
8646 zmalloc_used_memory() > server.vm_max_memory)
8647 {
8648 int more = 1;
8649 while(more) {
8650 lockThreadedIO();
8651 more = listLength(server.io_newjobs) <
8652 (unsigned) server.vm_max_threads;
8653 unlockThreadedIO();
8654 /* Don't waste CPU time if swappable objects are rare. */
8655 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8656 trytoswap = 0;
8657 break;
8658 }
8659 }
8660 }
8661 }
8662 processed++;
8663 if (processed == toprocess) return;
8664 }
8665 if (retval < 0 && errno != EAGAIN) {
8666 redisLog(REDIS_WARNING,
8667 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8668 strerror(errno));
8669 }
8670 }
8671
8672 static void lockThreadedIO(void) {
8673 pthread_mutex_lock(&server.io_mutex);
8674 }
8675
8676 static void unlockThreadedIO(void) {
8677 pthread_mutex_unlock(&server.io_mutex);
8678 }
8679
8680 /* Remove the specified object from the threaded I/O queue if still not
8681 * processed, otherwise make sure to flag it as canceled. */
8682 static void vmCancelThreadedIOJob(robj *o) {
8683 list *lists[3] = {
8684 server.io_newjobs, /* 0 */
8685 server.io_processing, /* 1 */
8686 server.io_processed /* 2 */
8687 };
8688 int i;
8689
8690 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8691 again:
8692 lockThreadedIO();
8693 /* Search for a matching key in one of the queues */
8694 for (i = 0; i < 3; i++) {
8695 listNode *ln;
8696 listIter li;
8697
8698 listRewind(lists[i],&li);
8699 while ((ln = listNext(&li)) != NULL) {
8700 iojob *job = ln->value;
8701
8702 if (job->canceled) continue; /* Skip this, already canceled. */
8703 if (compareStringObjects(job->key,o) == 0) {
8704 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8705 (void*)job, (char*)o->ptr, job->type, i);
8706 /* Mark the pages as free since the swap didn't happened
8707 * or happened but is now discarded. */
8708 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8709 vmMarkPagesFree(job->page,job->pages);
8710 /* Cancel the job. It depends on the list the job is
8711 * living in. */
8712 switch(i) {
8713 case 0: /* io_newjobs */
8714 /* If the job was yet not processed the best thing to do
8715 * is to remove it from the queue at all */
8716 freeIOJob(job);
8717 listDelNode(lists[i],ln);
8718 break;
8719 case 1: /* io_processing */
8720 /* Oh Shi- the thread is messing with the Job:
8721 *
8722 * Probably it's accessing the object if this is a
8723 * PREPARE_SWAP or DO_SWAP job.
8724 * If it's a LOAD job it may be reading from disk and
8725 * if we don't wait for the job to terminate before to
8726 * cancel it, maybe in a few microseconds data can be
8727 * corrupted in this pages. So the short story is:
8728 *
8729 * Better to wait for the job to move into the
8730 * next queue (processed)... */
8731
8732 /* We try again and again until the job is completed. */
8733 unlockThreadedIO();
8734 /* But let's wait some time for the I/O thread
8735 * to finish with this job. After all this condition
8736 * should be very rare. */
8737 usleep(1);
8738 goto again;
8739 case 2: /* io_processed */
8740 /* The job was already processed, that's easy...
8741 * just mark it as canceled so that we'll ignore it
8742 * when processing completed jobs. */
8743 job->canceled = 1;
8744 break;
8745 }
8746 /* Finally we have to adjust the storage type of the object
8747 * in order to "UNDO" the operaiton. */
8748 if (o->storage == REDIS_VM_LOADING)
8749 o->storage = REDIS_VM_SWAPPED;
8750 else if (o->storage == REDIS_VM_SWAPPING)
8751 o->storage = REDIS_VM_MEMORY;
8752 unlockThreadedIO();
8753 return;
8754 }
8755 }
8756 }
8757 unlockThreadedIO();
8758 assert(1 != 1); /* We should never reach this */
8759 }
8760
8761 static void *IOThreadEntryPoint(void *arg) {
8762 iojob *j;
8763 listNode *ln;
8764 REDIS_NOTUSED(arg);
8765
8766 pthread_detach(pthread_self());
8767 while(1) {
8768 /* Get a new job to process */
8769 lockThreadedIO();
8770 if (listLength(server.io_newjobs) == 0) {
8771 /* No new jobs in queue, exit. */
8772 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8773 (long) pthread_self());
8774 server.io_active_threads--;
8775 unlockThreadedIO();
8776 return NULL;
8777 }
8778 ln = listFirst(server.io_newjobs);
8779 j = ln->value;
8780 listDelNode(server.io_newjobs,ln);
8781 /* Add the job in the processing queue */
8782 j->thread = pthread_self();
8783 listAddNodeTail(server.io_processing,j);
8784 ln = listLast(server.io_processing); /* We use ln later to remove it */
8785 unlockThreadedIO();
8786 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8787 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8788
8789 /* Process the Job */
8790 if (j->type == REDIS_IOJOB_LOAD) {
8791 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8792 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8793 FILE *fp = fopen("/dev/null","w+");
8794 j->pages = rdbSavedObjectPages(j->val,fp);
8795 fclose(fp);
8796 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8797 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8798 j->canceled = 1;
8799 }
8800
8801 /* Done: insert the job into the processed queue */
8802 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8803 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8804 lockThreadedIO();
8805 listDelNode(server.io_processing,ln);
8806 listAddNodeTail(server.io_processed,j);
8807 unlockThreadedIO();
8808
8809 /* Signal the main thread there is new stuff to process */
8810 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8811 }
8812 return NULL; /* never reached */
8813 }
8814
8815 static void spawnIOThread(void) {
8816 pthread_t thread;
8817 sigset_t mask, omask;
8818 int err;
8819
8820 sigemptyset(&mask);
8821 sigaddset(&mask,SIGCHLD);
8822 sigaddset(&mask,SIGHUP);
8823 sigaddset(&mask,SIGPIPE);
8824 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8825 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8826 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8827 strerror(err));
8828 usleep(1000000);
8829 }
8830 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8831 server.io_active_threads++;
8832 }
8833
8834 /* We need to wait for the last thread to exit before we are able to
8835 * fork() in order to BGSAVE or BGREWRITEAOF. */
8836 static void waitEmptyIOJobsQueue(void) {
8837 while(1) {
8838 int io_processed_len;
8839
8840 lockThreadedIO();
8841 if (listLength(server.io_newjobs) == 0 &&
8842 listLength(server.io_processing) == 0 &&
8843 server.io_active_threads == 0)
8844 {
8845 unlockThreadedIO();
8846 return;
8847 }
8848 /* While waiting for empty jobs queue condition we post-process some
8849 * finshed job, as I/O threads may be hanging trying to write against
8850 * the io_ready_pipe_write FD but there are so much pending jobs that
8851 * it's blocking. */
8852 io_processed_len = listLength(server.io_processed);
8853 unlockThreadedIO();
8854 if (io_processed_len) {
8855 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8856 usleep(1000); /* 1 millisecond */
8857 } else {
8858 usleep(10000); /* 10 milliseconds */
8859 }
8860 }
8861 }
8862
8863 static void vmReopenSwapFile(void) {
8864 /* Note: we don't close the old one as we are in the child process
8865 * and don't want to mess at all with the original file object. */
8866 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8867 if (server.vm_fp == NULL) {
8868 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8869 server.vm_swap_file);
8870 _exit(1);
8871 }
8872 server.vm_fd = fileno(server.vm_fp);
8873 }
8874
8875 /* This function must be called while with threaded IO locked */
8876 static void queueIOJob(iojob *j) {
8877 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8878 (void*)j, j->type, (char*)j->key->ptr);
8879 listAddNodeTail(server.io_newjobs,j);
8880 if (server.io_active_threads < server.vm_max_threads)
8881 spawnIOThread();
8882 }
8883
8884 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8885 iojob *j;
8886
8887 assert(key->storage == REDIS_VM_MEMORY);
8888 assert(key->refcount == 1);
8889
8890 j = zmalloc(sizeof(*j));
8891 j->type = REDIS_IOJOB_PREPARE_SWAP;
8892 j->db = db;
8893 j->key = dupStringObject(key);
8894 j->val = val;
8895 incrRefCount(val);
8896 j->canceled = 0;
8897 j->thread = (pthread_t) -1;
8898 key->storage = REDIS_VM_SWAPPING;
8899
8900 lockThreadedIO();
8901 queueIOJob(j);
8902 unlockThreadedIO();
8903 return REDIS_OK;
8904 }
8905
8906 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8907
8908 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8909 * If there is not already a job loading the key, it is craeted.
8910 * The key is added to the io_keys list in the client structure, and also
8911 * in the hash table mapping swapped keys to waiting clients, that is,
8912 * server.io_waited_keys. */
8913 static int waitForSwappedKey(redisClient *c, robj *key) {
8914 struct dictEntry *de;
8915 robj *o;
8916 list *l;
8917
8918 /* If the key does not exist or is already in RAM we don't need to
8919 * block the client at all. */
8920 de = dictFind(c->db->dict,key);
8921 if (de == NULL) return 0;
8922 o = dictGetEntryKey(de);
8923 if (o->storage == REDIS_VM_MEMORY) {
8924 return 0;
8925 } else if (o->storage == REDIS_VM_SWAPPING) {
8926 /* We were swapping the key, undo it! */
8927 vmCancelThreadedIOJob(o);
8928 return 0;
8929 }
8930
8931 /* OK: the key is either swapped, or being loaded just now. */
8932
8933 /* Add the key to the list of keys this client is waiting for.
8934 * This maps clients to keys they are waiting for. */
8935 listAddNodeTail(c->io_keys,key);
8936 incrRefCount(key);
8937
8938 /* Add the client to the swapped keys => clients waiting map. */
8939 de = dictFind(c->db->io_keys,key);
8940 if (de == NULL) {
8941 int retval;
8942
8943 /* For every key we take a list of clients blocked for it */
8944 l = listCreate();
8945 retval = dictAdd(c->db->io_keys,key,l);
8946 incrRefCount(key);
8947 assert(retval == DICT_OK);
8948 } else {
8949 l = dictGetEntryVal(de);
8950 }
8951 listAddNodeTail(l,c);
8952
8953 /* Are we already loading the key from disk? If not create a job */
8954 if (o->storage == REDIS_VM_SWAPPED) {
8955 iojob *j;
8956
8957 o->storage = REDIS_VM_LOADING;
8958 j = zmalloc(sizeof(*j));
8959 j->type = REDIS_IOJOB_LOAD;
8960 j->db = c->db;
8961 j->key = dupStringObject(key);
8962 j->key->vtype = o->vtype;
8963 j->page = o->vm.page;
8964 j->val = NULL;
8965 j->canceled = 0;
8966 j->thread = (pthread_t) -1;
8967 lockThreadedIO();
8968 queueIOJob(j);
8969 unlockThreadedIO();
8970 }
8971 return 1;
8972 }
8973
8974 /* Preload keys needed for the ZUNION and ZINTER commands. */
8975 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
8976 int i, num;
8977 num = atoi(c->argv[2]->ptr);
8978 for (i = 0; i < num; i++) {
8979 waitForSwappedKey(c,c->argv[3+i]);
8980 }
8981 }
8982
8983 /* Is this client attempting to run a command against swapped keys?
8984 * If so, block it ASAP, load the keys in background, then resume it.
8985 *
8986 * The important idea about this function is that it can fail! If keys will
8987 * still be swapped when the client is resumed, this key lookups will
8988 * just block loading keys from disk. In practical terms this should only
8989 * happen with SORT BY command or if there is a bug in this function.
8990 *
8991 * Return 1 if the client is marked as blocked, 0 if the client can
8992 * continue as the keys it is going to access appear to be in memory. */
8993 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8994 int j, last;
8995
8996 if (cmd->vm_preload_proc != NULL) {
8997 cmd->vm_preload_proc(c);
8998 } else {
8999 if (cmd->vm_firstkey == 0) return 0;
9000 last = cmd->vm_lastkey;
9001 if (last < 0) last = c->argc+last;
9002 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9003 waitForSwappedKey(c,c->argv[j]);
9004 }
9005
9006 /* If the client was blocked for at least one key, mark it as blocked. */
9007 if (listLength(c->io_keys)) {
9008 c->flags |= REDIS_IO_WAIT;
9009 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9010 server.vm_blocked_clients++;
9011 return 1;
9012 } else {
9013 return 0;
9014 }
9015 }
9016
9017 /* Remove the 'key' from the list of blocked keys for a given client.
9018 *
9019 * The function returns 1 when there are no longer blocking keys after
9020 * the current one was removed (and the client can be unblocked). */
9021 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9022 list *l;
9023 listNode *ln;
9024 listIter li;
9025 struct dictEntry *de;
9026
9027 /* Remove the key from the list of keys this client is waiting for. */
9028 listRewind(c->io_keys,&li);
9029 while ((ln = listNext(&li)) != NULL) {
9030 if (compareStringObjects(ln->value,key) == 0) {
9031 listDelNode(c->io_keys,ln);
9032 break;
9033 }
9034 }
9035 assert(ln != NULL);
9036
9037 /* Remove the client form the key => waiting clients map. */
9038 de = dictFind(c->db->io_keys,key);
9039 assert(de != NULL);
9040 l = dictGetEntryVal(de);
9041 ln = listSearchKey(l,c);
9042 assert(ln != NULL);
9043 listDelNode(l,ln);
9044 if (listLength(l) == 0)
9045 dictDelete(c->db->io_keys,key);
9046
9047 return listLength(c->io_keys) == 0;
9048 }
9049
9050 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9051 struct dictEntry *de;
9052 list *l;
9053 listNode *ln;
9054 int len;
9055
9056 de = dictFind(db->io_keys,key);
9057 if (!de) return;
9058
9059 l = dictGetEntryVal(de);
9060 len = listLength(l);
9061 /* Note: we can't use something like while(listLength(l)) as the list
9062 * can be freed by the calling function when we remove the last element. */
9063 while (len--) {
9064 ln = listFirst(l);
9065 redisClient *c = ln->value;
9066
9067 if (dontWaitForSwappedKey(c,key)) {
9068 /* Put the client in the list of clients ready to go as we
9069 * loaded all the keys about it. */
9070 listAddNodeTail(server.io_ready_clients,c);
9071 }
9072 }
9073 }
9074
9075 /* =========================== Remote Configuration ========================= */
9076
9077 static void configSetCommand(redisClient *c) {
9078 robj *o = getDecodedObject(c->argv[3]);
9079 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9080 zfree(server.dbfilename);
9081 server.dbfilename = zstrdup(o->ptr);
9082 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9083 zfree(server.requirepass);
9084 server.requirepass = zstrdup(o->ptr);
9085 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9086 zfree(server.masterauth);
9087 server.masterauth = zstrdup(o->ptr);
9088 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9089 server.maxmemory = strtoll(o->ptr, NULL, 10);
9090 } else {
9091 addReplySds(c,sdscatprintf(sdsempty(),
9092 "-ERR not supported CONFIG parameter %s\r\n",
9093 (char*)c->argv[2]->ptr));
9094 decrRefCount(o);
9095 return;
9096 }
9097 decrRefCount(o);
9098 addReply(c,shared.ok);
9099 }
9100
9101 static void configGetCommand(redisClient *c) {
9102 robj *o = getDecodedObject(c->argv[2]);
9103 robj *lenobj = createObject(REDIS_STRING,NULL);
9104 char *pattern = o->ptr;
9105 int matches = 0;
9106
9107 addReply(c,lenobj);
9108 decrRefCount(lenobj);
9109
9110 if (stringmatch(pattern,"dbfilename",0)) {
9111 addReplyBulkCString(c,"dbfilename");
9112 addReplyBulkCString(c,server.dbfilename);
9113 matches++;
9114 }
9115 if (stringmatch(pattern,"requirepass",0)) {
9116 addReplyBulkCString(c,"requirepass");
9117 addReplyBulkCString(c,server.requirepass);
9118 matches++;
9119 }
9120 if (stringmatch(pattern,"masterauth",0)) {
9121 addReplyBulkCString(c,"masterauth");
9122 addReplyBulkCString(c,server.masterauth);
9123 matches++;
9124 }
9125 if (stringmatch(pattern,"maxmemory",0)) {
9126 char buf[128];
9127
9128 snprintf(buf,128,"%llu\n",server.maxmemory);
9129 addReplyBulkCString(c,"maxmemory");
9130 addReplyBulkCString(c,buf);
9131 matches++;
9132 }
9133 decrRefCount(o);
9134 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9135 }
9136
9137 static void configCommand(redisClient *c) {
9138 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9139 if (c->argc != 4) goto badarity;
9140 configSetCommand(c);
9141 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9142 if (c->argc != 3) goto badarity;
9143 configGetCommand(c);
9144 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9145 if (c->argc != 2) goto badarity;
9146 server.stat_numcommands = 0;
9147 server.stat_numconnections = 0;
9148 server.stat_expiredkeys = 0;
9149 server.stat_starttime = time(NULL);
9150 addReply(c,shared.ok);
9151 } else {
9152 addReplySds(c,sdscatprintf(sdsempty(),
9153 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9154 }
9155 return;
9156
9157 badarity:
9158 addReplySds(c,sdscatprintf(sdsempty(),
9159 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9160 (char*) c->argv[1]->ptr));
9161 }
9162
9163 /* ================================= Debugging ============================== */
9164
9165 static void debugCommand(redisClient *c) {
9166 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9167 *((char*)-1) = 'x';
9168 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9169 if (rdbSave(server.dbfilename) != REDIS_OK) {
9170 addReply(c,shared.err);
9171 return;
9172 }
9173 emptyDb();
9174 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9175 addReply(c,shared.err);
9176 return;
9177 }
9178 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9179 addReply(c,shared.ok);
9180 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9181 emptyDb();
9182 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9183 addReply(c,shared.err);
9184 return;
9185 }
9186 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9187 addReply(c,shared.ok);
9188 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9189 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9190 robj *key, *val;
9191
9192 if (!de) {
9193 addReply(c,shared.nokeyerr);
9194 return;
9195 }
9196 key = dictGetEntryKey(de);
9197 val = dictGetEntryVal(de);
9198 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9199 key->storage == REDIS_VM_SWAPPING)) {
9200 char *strenc;
9201 char buf[128];
9202
9203 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9204 strenc = strencoding[val->encoding];
9205 } else {
9206 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9207 strenc = buf;
9208 }
9209 addReplySds(c,sdscatprintf(sdsempty(),
9210 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9211 "encoding:%s serializedlength:%lld\r\n",
9212 (void*)key, key->refcount, (void*)val, val->refcount,
9213 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9214 } else {
9215 addReplySds(c,sdscatprintf(sdsempty(),
9216 "+Key at:%p refcount:%d, value swapped at: page %llu "
9217 "using %llu pages\r\n",
9218 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9219 (unsigned long long) key->vm.usedpages));
9220 }
9221 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9222 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9223 robj *key, *val;
9224
9225 if (!server.vm_enabled) {
9226 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9227 return;
9228 }
9229 if (!de) {
9230 addReply(c,shared.nokeyerr);
9231 return;
9232 }
9233 key = dictGetEntryKey(de);
9234 val = dictGetEntryVal(de);
9235 /* If the key is shared we want to create a copy */
9236 if (key->refcount > 1) {
9237 robj *newkey = dupStringObject(key);
9238 decrRefCount(key);
9239 key = dictGetEntryKey(de) = newkey;
9240 }
9241 /* Swap it */
9242 if (key->storage != REDIS_VM_MEMORY) {
9243 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9244 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9245 dictGetEntryVal(de) = NULL;
9246 addReply(c,shared.ok);
9247 } else {
9248 addReply(c,shared.err);
9249 }
9250 } else {
9251 addReplySds(c,sdsnew(
9252 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9253 }
9254 }
9255
9256 static void _redisAssert(char *estr, char *file, int line) {
9257 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9258 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9259 #ifdef HAVE_BACKTRACE
9260 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9261 *((char*)-1) = 'x';
9262 #endif
9263 }
9264
9265 /* =================================== Main! ================================ */
9266
9267 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its first line converted
 * to an integer with atoi(). Returns -1 if the file can't be opened or
 * nothing can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,64,fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
9281
9282 void linuxOvercommitMemoryWarning(void) {
9283 if (linuxOvercommitMemoryValue() == 0) {
9284 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9285 }
9286 }
9287 #endif /* __linux__ */
9288
9289 static void daemonize(void) {
9290 int fd;
9291 FILE *fp;
9292
9293 if (fork() != 0) exit(0); /* parent exits */
9294 setsid(); /* create a new session */
9295
9296 /* Every output goes to /dev/null. If Redis is daemonized but
9297 * the 'logfile' is set to 'stdout' in the configuration file
9298 * it will not log at all. */
9299 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9300 dup2(fd, STDIN_FILENO);
9301 dup2(fd, STDOUT_FILENO);
9302 dup2(fd, STDERR_FILENO);
9303 if (fd > STDERR_FILENO) close(fd);
9304 }
9305 /* Try to write the pid file */
9306 fp = fopen(server.pidfile,"w");
9307 if (fp) {
9308 fprintf(fp,"%d\n",getpid());
9309 fclose(fp);
9310 }
9311 }
9312
9313 static void version() {
9314 printf("Redis server version %s\n", REDIS_VERSION);
9315 exit(0);
9316 }
9317
/* Print the command line help on standard error and exit with status 1. */
static void usage()
{
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9323
9324 int main(int argc, char **argv) {
9325 time_t start;
9326
9327 initServerConfig();
9328 if (argc == 2) {
9329 if (strcmp(argv[1], "-v") == 0 ||
9330 strcmp(argv[1], "--version") == 0) version();
9331 if (strcmp(argv[1], "--help") == 0) usage();
9332 resetServerSaveParams();
9333 loadServerConfig(argv[1]);
9334 } else if ((argc > 2)) {
9335 usage();
9336 } else {
9337 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9338 }
9339 if (server.daemonize) daemonize();
9340 initServer();
9341 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9342 #ifdef __linux__
9343 linuxOvercommitMemoryWarning();
9344 #endif
9345 start = time(NULL);
9346 if (server.appendonly) {
9347 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9348 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9349 } else {
9350 if (rdbLoad(server.dbfilename) == REDIS_OK)
9351 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9352 }
9353 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9354 aeSetBeforeSleepProc(server.el,beforeSleep);
9355 aeMain(server.el);
9356 aeDeleteEventLoop(server.el);
9357 return 0;
9358 }
9359
9360 /* ============================= Backtrace support ========================= */
9361
9362 #ifdef HAVE_BACKTRACE
/* Forward declaration: findFuncName() is defined after segvHandler(), which
 * calls it. */
9363 static char *findFuncName(void *pointer, unsigned long *offset);
9364
/* Return, as a void pointer, a register value read from the machine context
 * of 'uc' (presumably the instruction pointer at the time the signal was
 * raised, judging by the field names -- TODO confirm). The field to read is
 * selected at compile time by the platform macros below; the first branch
 * that matches wins, so the order of the tests matters. NULL is returned on
 * platforms not covered by any branch. */
9365 static void *getMcontextEip(ucontext_t *uc) {
9366 #if defined(__FreeBSD__)
9367 return (void*) uc->uc_mcontext.mc_eip;
9368 #elif defined(__dietlibc__)
9369 return (void*) uc->uc_mcontext.eip;
/* Apple, when MAC_OS_X_VERSION_10_6 is not defined. */
9370 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9371 #if __x86_64__
9372 return (void*) uc->uc_mcontext->__ss.__rip;
9373 #else
9374 return (void*) uc->uc_mcontext->__ss.__eip;
9375 #endif
/* Apple, when MAC_OS_X_VERSION_10_6 is defined. */
9376 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9377 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9378 return (void*) uc->uc_mcontext->__ss.__rip;
9379 #else
9380 return (void*) uc->uc_mcontext->__ss.__eip;
9381 #endif
/* NOTE(review): REG_EIP is used for the 64 bit targets too; verify that the
 * system headers define it there. */
9382 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9383 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9384 #elif defined(__ia64__) /* Linux IA64 */
9385 return (void*) uc->uc_mcontext.sc_ip;
9386 #else
/* Unsupported platform: the caller checks for NULL. */
9387 return NULL;
9388 #endif
9389 }
9390
9391 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9392 void *trace[100];
9393 char **messages = NULL;
9394 int i, trace_size = 0;
9395 unsigned long offset=0;
9396 ucontext_t *uc = (ucontext_t*) secret;
9397 sds infostring;
9398 REDIS_NOTUSED(info);
9399
9400 redisLog(REDIS_WARNING,
9401 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9402 infostring = genRedisInfoString();
9403 redisLog(REDIS_WARNING, "%s",infostring);
9404 /* It's not safe to sdsfree() the returned string under memory
9405 * corruption conditions. Let it leak as we are going to abort */
9406
9407 trace_size = backtrace(trace, 100);
9408 /* overwrite sigaction with caller's address */
9409 if (getMcontextEip(uc) != NULL) {
9410 trace[1] = getMcontextEip(uc);
9411 }
9412 messages = backtrace_symbols(trace, trace_size);
9413
9414 for (i=1; i<trace_size; ++i) {
9415 char *fn = findFuncName(trace[i], &offset), *p;
9416
9417 p = strchr(messages[i],'+');
9418 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9419 redisLog(REDIS_WARNING,"%s", messages[i]);
9420 } else {
9421 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9422 }
9423 }
9424 /* free(messages); Don't call free() with possibly corrupted memory. */
9425 _exit(0);
9426 }
9427
9428 static void setupSigSegvAction(void) {
9429 struct sigaction act;
9430
9431 sigemptyset (&act.sa_mask);
9432 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9433 * is used. Otherwise, sa_handler is used */
9434 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9435 act.sa_sigaction = segvHandler;
9436 sigaction (SIGSEGV, &act, NULL);
9437 sigaction (SIGBUS, &act, NULL);
9438 sigaction (SIGFPE, &act, NULL);
9439 sigaction (SIGILL, &act, NULL);
9440 sigaction (SIGBUS, &act, NULL);
9441 return;
9442 }
9443
9444 #include "staticsymbols.h"
9445 /* This function try to convert a pointer into a function name. It's used in
9446 * oreder to provide a backtrace under segmentation fault that's able to
9447 * display functions declared as static (otherwise the backtrace is useless). */
9448 static char *findFuncName(void *pointer, unsigned long *offset){
9449 int i, ret = -1;
9450 unsigned long off, minoff = 0;
9451
9452 /* Try to match against the Symbol with the smallest offset */
9453 for (i=0; symsTable[i].pointer; i++) {
9454 unsigned long lp = (unsigned long) pointer;
9455
9456 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9457 off=lp-symsTable[i].pointer;
9458 if (ret < 0 || off < minoff) {
9459 minoff=off;
9460 ret=i;
9461 }
9462 }
9463 }
9464 if (ret == -1) return NULL;
9465 *offset = minoff;
9466 return symsTable[ret].name;
9467 }
9468 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install. */
static void setupSigSegvAction(void)
{
}
9471 #endif /* HAVE_BACKTRACE */
9472
9473
9474
9475 /* The End */
9476
9477
9478