]> git.saurik.com Git - redis.git/blob - redis.c
025f9ddac6fd3942a89fd945188359f71b84a2d6
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.7"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short, these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kind of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
129
/* Printable names of the object encodings. The entries are listed in the
 * same order as the REDIS_ENCODING_* constants (RAW, INT, ZIPMAP, HT). */
static char *strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
133
134 /* Object types only used for dumping to disk */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
138
139 /* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specify the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
149 *
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX
157
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
165
166 /* Virtual memory object->where field. */
167 #define REDIS_VM_MEMORY 0 /* The object is on memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
184 /* Client flags */
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
197 /* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
206 /* List related stuff */
207 #define REDIS_HEAD 0
208 #define REDIS_TAIL 1
209
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024
215
216 /* Log levels */
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
221
222 /* Anti-warning macro... */
223 #define REDIS_NOTUSED(V) ((void) V)
224
225 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227
228 /* Append only defines */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
232
233 /* Hashes related defaults */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* Project-specific assert. When the checked expression evaluates to false,
 * _redisAssert() is invoked with the stringified expression, the source file
 * name and the line number, and then the process is terminated with
 * _exit(1). A dedicated helper is used (instead of plain assert()) because,
 * as the original note says, it lets us print the stack trace. */
static void _redisAssert(char *estr, char *file, int line);

#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
/* The VM object structure: swap-related bookkeeping embedded in every
 * Redis object. */
struct redisObjectVM {
    off_t page;      /* The page at which the object is stored on disk */
    off_t usedpages; /* Number of pages used on disk */
    time_t atime;    /* Last access time */
};

/* File-scope object of the above type, declared together with the struct.
 * NOTE(review): nothing in the visible part of the file uses it, it may be
 * an accidental leftover -- confirm before removing. */
struct redisObjectVM vm;
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr;
255 unsigned char type;
256 unsigned char encoding;
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount;
262 /* VM fields, this are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Initialize a Redis string object allocated on the stack.
 *
 * _var is the robj variable itself (not a pointer to it); _ptr is the value
 * stored into its 'ptr' field. The object gets refcount 1, type REDIS_STRING
 * and REDIS_ENCODING_RAW encoding. The 'storage' field is set to
 * REDIS_VM_MEMORY only when server.vm_enabled is true, otherwise it is left
 * untouched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion intentionally does NOT end with a semicolon: the ';' written
 * by the caller completes the do/while(0), so the macro behaves as exactly
 * one statement and can be used as the body of an unbraced if/else. The
 * arguments are parenthesized so that expression arguments expand safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
281 typedef struct redisDb {
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id;
287 } redisDb;
288
289 /* Client MULTI/EXEC state */
290 typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294 } multiCmd;
295
296 typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to take per-clinet state.
302 * Clients are taken in a liked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The key we are waiting to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 } redisClient;
331
/* One save point: a pair made of a number of seconds and a number of
 * changes. */
struct saveparam {
    time_t seconds; /* Seconds component of the save point */
    int changes;    /* Changes component of the save point */
};
336
337 /* Global server state structure */
338 struct redisServer {
339 int port;
340 int fd;
341 redisDb *db;
342 dict *sharingpool; /* Poll used for object sharing */
343 unsigned int sharingpoolsize;
344 long long dirty; /* changes to DB from the last save */
345 list *clients;
346 list *slaves, *monitors;
347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of last save succeeede */
352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 long long stat_expiredkeys; /* number of expired keys */
357 /* Configuration */
358 int verbosity;
359 int glueoutputbuf;
360 int maxidletime;
361 int dbnum;
362 int daemonize;
363 int appendonly;
364 int appendfsync;
365 time_t lastfsync;
366 int appendfd;
367 int appendseldb;
368 char *pidfile;
369 pid_t bgsavechildpid;
370 pid_t bgrewritechildpid;
371 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
372 struct saveparam *saveparams;
373 int saveparamslen;
374 char *logfile;
375 char *bindaddr;
376 char *dbfilename;
377 char *appendfilename;
378 char *requirepass;
379 int shareobjects;
380 int rdbcompression;
381 /* Replication related */
382 int isslave;
383 char *masterauth;
384 char *masterhost;
385 int masterport;
386 redisClient *master; /* client that is master for this slave */
387 int replstate;
388 unsigned int maxclients;
389 unsigned long long maxmemory;
390 unsigned int blpop_blocked_clients;
391 unsigned int vm_blocked_clients;
392 /* Sort parameters - qsort_r() is only available under BSD so we
393 * have to take this state global, in order to pass it to sortCompare() */
394 int sort_desc;
395 int sort_alpha;
396 int sort_bypattern;
397 /* Virtual memory configuration */
398 int vm_enabled;
399 char *vm_swap_file;
400 off_t vm_page_size;
401 off_t vm_pages;
402 unsigned long long vm_max_memory;
403 /* Hashes config */
404 size_t hash_max_zipmap_entries;
405 size_t hash_max_zipmap_value;
406 /* Virtual memory state */
407 FILE *vm_fp;
408 int vm_fd;
409 off_t vm_next_page; /* Next probably empty page */
410 off_t vm_near_pages; /* Number of pages allocated sequentially */
411 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
412 time_t unixtime; /* Unix time sampled every second. */
413 /* Virtual memory I/O threads stuff */
414 /* An I/O thread process an element taken from the io_jobs queue and
415 * put the result of the operation in the io_done list. While the
416 * job is being processed, it's put on io_processing queue. */
417 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
418 list *io_processing; /* List of VM I/O jobs being processed */
419 list *io_processed; /* List of VM I/O jobs already processed */
420 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
421 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
422 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
423 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
424 pthread_attr_t io_threads_attr; /* attributes for threads creation */
425 int io_active_threads; /* Number of running I/O threads */
426 int vm_max_threads; /* Max number of I/O threads running at the same time */
427 /* Our main thread is blocked on the event loop, locking for sockets ready
428 * to be read or written, so when a threaded I/O operation is ready to be
429 * processed by the main thread, the I/O thread will use a unix pipe to
430 * awake the main thread. The followings are the two pipe FDs. */
431 int io_ready_pipe_read;
432 int io_ready_pipe_write;
433 /* Virtual memory stats */
434 unsigned long long vm_stats_used_pages;
435 unsigned long long vm_stats_swapped_objects;
436 unsigned long long vm_stats_swapouts;
437 unsigned long long vm_stats_swapins;
438 FILE *devnull;
439 };
440
441 typedef void redisCommandProc(redisClient *c);
442 struct redisCommand {
443 char *name;
444 redisCommandProc *proc;
445 int arity;
446 int flags;
447 /* Use a function to determine which keys need to be loaded
448 * in the background prior to executing this command. Takes precedence
449 * over vm_firstkey and others, ignored when NULL */
450 redisCommandProc *vm_preload_proc;
451 /* What keys should be loaded in background when calling this command? */
452 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
453 int vm_lastkey; /* THe last argument that's a key */
454 int vm_keystep; /* The step between first and last key */
455 };
456
/* Pairs a name with a numeric pointer value.
 * NOTE(review): presumably a function symbol table entry, judging by the
 * struct name -- confirm against the code using it. */
struct redisFunctionSym {
    char *name;            /* Symbol name */
    unsigned long pointer; /* Value stored as an unsigned long */
};
461
462 typedef struct _redisSortObject {
463 robj *obj;
464 union {
465 double score;
466 robj *cmpobj;
467 } u;
468 } redisSortObject;
469
470 typedef struct _redisSortOperation {
471 int type;
472 robj *pattern;
473 } redisSortOperation;
474
475 /* ZSETs use a specialized version of Skiplists */
476
477 typedef struct zskiplistNode {
478 struct zskiplistNode **forward;
479 struct zskiplistNode *backward;
480 unsigned int *span;
481 double score;
482 robj *obj;
483 } zskiplistNode;
484
485 typedef struct zskiplist {
486 struct zskiplistNode *header, *tail;
487 unsigned long length;
488 int level;
489 } zskiplist;
490
491 typedef struct zset {
492 dict *dict;
493 zskiplist *zsl;
494 } zset;
495
496 /* Our shared "common" objects */
497
498 struct sharedObjectsStruct {
499 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
500 *colon, *nullbulk, *nullmultibulk, *queued,
501 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
502 *outofrangeerr, *plus,
503 *select0, *select1, *select2, *select3, *select4,
504 *select5, *select6, *select7, *select8, *select9;
505 } shared;
506
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */
static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
512
513 /* VM threaded I/O request message */
514 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
515 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
516 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
517 typedef struct iojob {
518 int type; /* Request type, REDIS_IOJOB_* */
519 redisDb *db;/* Redis database */
520 robj *key; /* This I/O request is about swapping this key */
521 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
522 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
523 off_t page; /* Swap page where to read/write the object */
524 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
525 int canceled; /* True if this command was canceled by blocking side of VM */
526 pthread_t thread; /* ID of the thread processing this entry */
527 } iojob;
528
529 /*================================ Prototypes =============================== */
530
531 static void freeStringObject(robj *o);
532 static void freeListObject(robj *o);
533 static void freeSetObject(robj *o);
534 static void decrRefCount(void *o);
535 static robj *createObject(int type, void *ptr);
536 static void freeClient(redisClient *c);
537 static int rdbLoad(char *filename);
538 static void addReply(redisClient *c, robj *obj);
539 static void addReplySds(redisClient *c, sds s);
540 static void incrRefCount(robj *o);
541 static int rdbSaveBackground(char *filename);
542 static robj *createStringObject(char *ptr, size_t len);
543 static robj *dupStringObject(robj *o);
544 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
545 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
546 static int syncWithMaster(void);
547 static robj *tryObjectSharing(robj *o);
548 static int tryObjectEncoding(robj *o);
549 static robj *getDecodedObject(robj *o);
550 static int removeExpire(redisDb *db, robj *key);
551 static int expireIfNeeded(redisDb *db, robj *key);
552 static int deleteIfVolatile(redisDb *db, robj *key);
553 static int deleteIfSwapped(redisDb *db, robj *key);
554 static int deleteKey(redisDb *db, robj *key);
555 static time_t getExpire(redisDb *db, robj *key);
556 static int setExpire(redisDb *db, robj *key, time_t when);
557 static void updateSlavesWaitingBgsave(int bgsaveerr);
558 static void freeMemoryIfNeeded(void);
559 static int processCommand(redisClient *c);
560 static void setupSigSegvAction(void);
561 static void rdbRemoveTempFile(pid_t childpid);
562 static void aofRemoveTempFile(pid_t childpid);
563 static size_t stringObjectLen(robj *o);
564 static void processInputBuffer(redisClient *c);
565 static zskiplist *zslCreate(void);
566 static void zslFree(zskiplist *zsl);
567 static void zslInsert(zskiplist *zsl, double score, robj *obj);
568 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
569 static void initClientMultiState(redisClient *c);
570 static void freeClientMultiState(redisClient *c);
571 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
572 static void unblockClientWaitingData(redisClient *c);
573 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
574 static void vmInit(void);
575 static void vmMarkPagesFree(off_t page, off_t count);
576 static robj *vmLoadObject(robj *key);
577 static robj *vmPreviewObject(robj *key);
578 static int vmSwapOneObjectBlocking(void);
579 static int vmSwapOneObjectThreaded(void);
580 static int vmCanSwapOut(void);
581 static int tryFreeOneObjectFromFreelist(void);
582 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void vmCancelThreadedIOJob(robj *o);
585 static void lockThreadedIO(void);
586 static void unlockThreadedIO(void);
587 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
588 static void freeIOJob(iojob *j);
589 static void queueIOJob(iojob *j);
590 static int vmWriteObjectOnSwap(robj *o, off_t page);
591 static robj *vmReadObjectFromSwap(off_t page, int type);
592 static void waitEmptyIOJobsQueue(void);
593 static void vmReopenSwapFile(void);
594 static int vmFreePage(off_t page);
595 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
596 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
597 static int dontWaitForSwappedKey(redisClient *c, robj *key);
598 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
599 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
600 static struct redisCommand *lookupCommand(char *name);
601 static void call(redisClient *c, struct redisCommand *cmd);
602 static void resetClient(redisClient *c);
603 static void convertToRealHash(robj *o);
604
605 static void authCommand(redisClient *c);
606 static void pingCommand(redisClient *c);
607 static void echoCommand(redisClient *c);
608 static void setCommand(redisClient *c);
609 static void setnxCommand(redisClient *c);
610 static void getCommand(redisClient *c);
611 static void delCommand(redisClient *c);
612 static void existsCommand(redisClient *c);
613 static void incrCommand(redisClient *c);
614 static void decrCommand(redisClient *c);
615 static void incrbyCommand(redisClient *c);
616 static void decrbyCommand(redisClient *c);
617 static void selectCommand(redisClient *c);
618 static void randomkeyCommand(redisClient *c);
619 static void keysCommand(redisClient *c);
620 static void dbsizeCommand(redisClient *c);
621 static void lastsaveCommand(redisClient *c);
622 static void saveCommand(redisClient *c);
623 static void bgsaveCommand(redisClient *c);
624 static void bgrewriteaofCommand(redisClient *c);
625 static void shutdownCommand(redisClient *c);
626 static void moveCommand(redisClient *c);
627 static void renameCommand(redisClient *c);
628 static void renamenxCommand(redisClient *c);
629 static void lpushCommand(redisClient *c);
630 static void rpushCommand(redisClient *c);
631 static void lpopCommand(redisClient *c);
632 static void rpopCommand(redisClient *c);
633 static void llenCommand(redisClient *c);
634 static void lindexCommand(redisClient *c);
635 static void lrangeCommand(redisClient *c);
636 static void ltrimCommand(redisClient *c);
637 static void typeCommand(redisClient *c);
638 static void lsetCommand(redisClient *c);
639 static void saddCommand(redisClient *c);
640 static void sremCommand(redisClient *c);
641 static void smoveCommand(redisClient *c);
642 static void sismemberCommand(redisClient *c);
643 static void scardCommand(redisClient *c);
644 static void spopCommand(redisClient *c);
645 static void srandmemberCommand(redisClient *c);
646 static void sinterCommand(redisClient *c);
647 static void sinterstoreCommand(redisClient *c);
648 static void sunionCommand(redisClient *c);
649 static void sunionstoreCommand(redisClient *c);
650 static void sdiffCommand(redisClient *c);
651 static void sdiffstoreCommand(redisClient *c);
652 static void syncCommand(redisClient *c);
653 static void flushdbCommand(redisClient *c);
654 static void flushallCommand(redisClient *c);
655 static void sortCommand(redisClient *c);
656 static void lremCommand(redisClient *c);
657 static void rpoplpushcommand(redisClient *c);
658 static void infoCommand(redisClient *c);
659 static void mgetCommand(redisClient *c);
660 static void monitorCommand(redisClient *c);
661 static void expireCommand(redisClient *c);
662 static void expireatCommand(redisClient *c);
663 static void getsetCommand(redisClient *c);
664 static void ttlCommand(redisClient *c);
665 static void slaveofCommand(redisClient *c);
666 static void debugCommand(redisClient *c);
667 static void msetCommand(redisClient *c);
668 static void msetnxCommand(redisClient *c);
669 static void zaddCommand(redisClient *c);
670 static void zincrbyCommand(redisClient *c);
671 static void zrangeCommand(redisClient *c);
672 static void zrangebyscoreCommand(redisClient *c);
673 static void zcountCommand(redisClient *c);
674 static void zrevrangeCommand(redisClient *c);
675 static void zcardCommand(redisClient *c);
676 static void zremCommand(redisClient *c);
677 static void zscoreCommand(redisClient *c);
678 static void zremrangebyscoreCommand(redisClient *c);
679 static void multiCommand(redisClient *c);
680 static void execCommand(redisClient *c);
681 static void discardCommand(redisClient *c);
682 static void blpopCommand(redisClient *c);
683 static void brpopCommand(redisClient *c);
684 static void appendCommand(redisClient *c);
685 static void substrCommand(redisClient *c);
686 static void zrankCommand(redisClient *c);
687 static void zrevrankCommand(redisClient *c);
688 static void hsetCommand(redisClient *c);
689 static void hgetCommand(redisClient *c);
690 static void hdelCommand(redisClient *c);
691 static void hlenCommand(redisClient *c);
692 static void zremrangebyrankCommand(redisClient *c);
693 static void zunionCommand(redisClient *c);
694 static void zinterCommand(redisClient *c);
695 static void hkeysCommand(redisClient *c);
696 static void hvalsCommand(redisClient *c);
697 static void hgetallCommand(redisClient *c);
698 static void hexistsCommand(redisClient *c);
699 static void configCommand(redisClient *c);
700
701 /*================================= Globals ================================= */
702
703 /* Global vars */
704 static struct redisServer server; /* server global state */
705 static struct redisCommand cmdTable[] = {
706 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
707 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
708 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
709 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
710 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
711 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
712 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
713 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
714 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
715 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
716 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
717 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
718 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
719 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
720 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
721 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
723 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
725 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
726 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
727 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
728 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
729 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
730 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
731 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
732 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
733 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
737 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
738 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
739 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
740 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
741 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
742 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
746 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
749 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
750 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
758 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
759 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
760 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
761 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
766 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
770 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
771 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
772 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
773 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
777 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
778 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
779 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
780 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
781 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
783 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
785 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
786 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
787 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
788 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
791 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
795 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
802 {NULL,NULL,0,0,NULL,0,0,0}
803 };
804
805 static void usage();
806
807 /*============================ Utility functions ============================ */
808
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of characters, including the empty one
 *   ?       exactly one character
 *   [...]   exactly one character out of a class; a leading '^' negates
 *           the class, "a-z" is a range and "\x" takes x literally. A class
 *           without the closing ']' ends where the pattern ends.
 *   \x      the character x taken literally
 *
 * If 'nocase' is non-zero, literals and ranges are compared after tolower();
 * escaped characters inside a class are always compared exactly.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: bytes at or after
 * pattern[patternLen] and string[stringLen] are never read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern against every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character: without this check
             * a negated class would "match" past the end of an empty
             * string and leave stringLen negative. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * increment after the switch lands exactly on the
                     * end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    /* Plain class member (this includes a backslash that
                     * is the very last byte of the pattern). */
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
931
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
935
936 static void redisLog(int level, const char *fmt, ...) {
937 va_list ap;
938 FILE *fp;
939
940 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
941 if (!fp) return;
942
943 va_start(ap, fmt);
944 if (level >= server.verbosity) {
945 char *c = ".-*#";
946 char buf[64];
947 time_t now;
948
949 now = time(NULL);
950 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
951 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
952 vfprintf(fp, fmt, ap);
953 fprintf(fp,"\n");
954 fflush(fp);
955 }
956 va_end(ap);
957
958 if (server.logfile) fclose(fp);
959 }
960
961 /*====================== Hash table type implementation ==================== */
962
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
966
/* Dictionary destructor callback that releases 'val' with zfree().
 * The 'privdata' argument is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
972
973 static void dictListDestructor(void *privdata, void *val)
974 {
975 DICT_NOTUSED(privdata);
976 listRelease((list*)val);
977 }
978
979 static int sdsDictKeyCompare(void *privdata, const void *key1,
980 const void *key2)
981 {
982 int l1,l2;
983 DICT_NOTUSED(privdata);
984
985 l1 = sdslen((sds)key1);
986 l2 = sdslen((sds)key2);
987 if (l1 != l2) return 0;
988 return memcmp(key1, key2, l1) == 0;
989 }
990
/* Dictionary destructor callback for redis objects: drops one reference
 * with decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
998
999 static int dictObjKeyCompare(void *privdata, const void *key1,
1000 const void *key2)
1001 {
1002 const robj *o1 = key1, *o2 = key2;
1003 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1004 }
1005
1006 static unsigned int dictObjHash(const void *key) {
1007 const robj *o = key;
1008 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1009 }
1010
1011 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013 {
1014 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1015 int cmp;
1016
1017 if (o1->encoding == REDIS_ENCODING_INT &&
1018 o2->encoding == REDIS_ENCODING_INT &&
1019 o1->ptr == o2->ptr) return 1;
1020
1021 o1 = getDecodedObject(o1);
1022 o2 = getDecodedObject(o2);
1023 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1024 decrRefCount(o1);
1025 decrRefCount(o2);
1026 return cmp;
1027 }
1028
1029 static unsigned int dictEncObjHash(const void *key) {
1030 robj *o = (robj*) key;
1031
1032 if (o->encoding == REDIS_ENCODING_RAW) {
1033 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1034 } else {
1035 if (o->encoding == REDIS_ENCODING_INT) {
1036 char buf[32];
1037 int len;
1038
1039 len = snprintf(buf,32,"%ld",(long)o->ptr);
1040 return dictGenHashFunction((unsigned char*)buf, len);
1041 } else {
1042 unsigned int hash;
1043
1044 o = getDecodedObject(o);
1045 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1046 decrRefCount(o);
1047 return hash;
1048 }
1049 }
1050 }
1051
1052 /* Sets type and expires */
1053 static dictType setDictType = {
1054 dictEncObjHash, /* hash function */
1055 NULL, /* key dup */
1056 NULL, /* val dup */
1057 dictEncObjKeyCompare, /* key compare */
1058 dictRedisObjectDestructor, /* key destructor */
1059 NULL /* val destructor */
1060 };
1061
1062 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1063 static dictType zsetDictType = {
1064 dictEncObjHash, /* hash function */
1065 NULL, /* key dup */
1066 NULL, /* val dup */
1067 dictEncObjKeyCompare, /* key compare */
1068 dictRedisObjectDestructor, /* key destructor */
1069 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1070 };
1071
1072 /* Db->dict */
1073 static dictType dbDictType = {
1074 dictObjHash, /* hash function */
1075 NULL, /* key dup */
1076 NULL, /* val dup */
1077 dictObjKeyCompare, /* key compare */
1078 dictRedisObjectDestructor, /* key destructor */
1079 dictRedisObjectDestructor /* val destructor */
1080 };
1081
1082 /* Db->expires */
1083 static dictType keyptrDictType = {
1084 dictObjHash, /* hash function */
1085 NULL, /* key dup */
1086 NULL, /* val dup */
1087 dictObjKeyCompare, /* key compare */
1088 dictRedisObjectDestructor, /* key destructor */
1089 NULL /* val destructor */
1090 };
1091
1092 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1093 static dictType hashDictType = {
1094 dictEncObjHash, /* hash function */
1095 NULL, /* key dup */
1096 NULL, /* val dup */
1097 dictEncObjKeyCompare, /* key compare */
1098 dictRedisObjectDestructor, /* key destructor */
1099 dictRedisObjectDestructor /* val destructor */
1100 };
1101
1102 /* Keylist hash table type has unencoded redis objects as keys and
1103 * lists as values. It's used for blocking operations (BLPOP) and to
1104 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1105 static dictType keylistDictType = {
1106 dictObjHash, /* hash function */
1107 NULL, /* key dup */
1108 NULL, /* val dup */
1109 dictObjKeyCompare, /* key compare */
1110 dictRedisObjectDestructor, /* key destructor */
1111 dictListDestructor /* val destructor */
1112 };
1113
1114 static void version();
1115
1116 /* ========================= Random utility functions ======================= */
1117
1118 /* Redis generally does not try to recover from out of memory conditions
1119 * when allocating objects or strings, it is not clear if it will be possible
1120 * to report this condition to the client since the networking layer itself
1121 * is based on heap allocation for send buffers, so we simply abort.
1122 * At least the code will be simpler to read... */
1123 static void oom(const char *msg) {
1124 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1125 sleep(1);
1126 abort();
1127 }
1128
1129 /* ====================== Redis server networking stuff ===================== */
1130 static void closeTimedoutClients(void) {
1131 redisClient *c;
1132 listNode *ln;
1133 time_t now = time(NULL);
1134 listIter li;
1135
1136 listRewind(server.clients,&li);
1137 while ((ln = listNext(&li)) != NULL) {
1138 c = listNodeValue(ln);
1139 if (server.maxidletime &&
1140 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1141 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1142 (now - c->lastinteraction > server.maxidletime))
1143 {
1144 redisLog(REDIS_VERBOSE,"Closing idle client");
1145 freeClient(c);
1146 } else if (c->flags & REDIS_BLOCKED) {
1147 if (c->blockingto != 0 && c->blockingto < now) {
1148 addReply(c,shared.nullmultibulk);
1149 unblockClientWaitingData(c);
1150 }
1151 }
1152 }
1153 }
1154
1155 static int htNeedsResize(dict *dict) {
1156 long long size, used;
1157
1158 size = dictSlots(dict);
1159 used = dictSize(dict);
1160 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1161 (used*100/size < REDIS_HT_MINFILL));
1162 }
1163
1164 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1165 * we resize the hash table to save memory */
1166 static void tryResizeHashTables(void) {
1167 int j;
1168
1169 for (j = 0; j < server.dbnum; j++) {
1170 if (htNeedsResize(server.db[j].dict)) {
1171 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1172 dictResize(server.db[j].dict);
1173 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1174 }
1175 if (htNeedsResize(server.db[j].expires))
1176 dictResize(server.db[j].expires);
1177 }
1178 }
1179
1180 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1181 void backgroundSaveDoneHandler(int statloc) {
1182 int exitcode = WEXITSTATUS(statloc);
1183 int bysignal = WIFSIGNALED(statloc);
1184
1185 if (!bysignal && exitcode == 0) {
1186 redisLog(REDIS_NOTICE,
1187 "Background saving terminated with success");
1188 server.dirty = 0;
1189 server.lastsave = time(NULL);
1190 } else if (!bysignal && exitcode != 0) {
1191 redisLog(REDIS_WARNING, "Background saving error");
1192 } else {
1193 redisLog(REDIS_WARNING,
1194 "Background saving terminated by signal");
1195 rdbRemoveTempFile(server.bgsavechildpid);
1196 }
1197 server.bgsavechildpid = -1;
1198 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1199 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1200 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1201 }
1202
1203 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1204 * Handle this. */
1205 void backgroundRewriteDoneHandler(int statloc) {
1206 int exitcode = WEXITSTATUS(statloc);
1207 int bysignal = WIFSIGNALED(statloc);
1208
1209 if (!bysignal && exitcode == 0) {
1210 int fd;
1211 char tmpfile[256];
1212
1213 redisLog(REDIS_NOTICE,
1214 "Background append only file rewriting terminated with success");
1215 /* Now it's time to flush the differences accumulated by the parent */
1216 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1217 fd = open(tmpfile,O_WRONLY|O_APPEND);
1218 if (fd == -1) {
1219 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1220 goto cleanup;
1221 }
1222 /* Flush our data... */
1223 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1224 (signed) sdslen(server.bgrewritebuf)) {
1225 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1226 close(fd);
1227 goto cleanup;
1228 }
1229 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1230 /* Now our work is to rename the temp file into the stable file. And
1231 * switch the file descriptor used by the server for append only. */
1232 if (rename(tmpfile,server.appendfilename) == -1) {
1233 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1234 close(fd);
1235 goto cleanup;
1236 }
1237 /* Mission completed... almost */
1238 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1239 if (server.appendfd != -1) {
1240 /* If append only is actually enabled... */
1241 close(server.appendfd);
1242 server.appendfd = fd;
1243 fsync(fd);
1244 server.appendseldb = -1; /* Make sure it will issue SELECT */
1245 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1246 } else {
1247 /* If append only is disabled we just generate a dump in this
1248 * format. Why not? */
1249 close(fd);
1250 }
1251 } else if (!bysignal && exitcode != 0) {
1252 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1253 } else {
1254 redisLog(REDIS_WARNING,
1255 "Background append only file rewriting terminated by signal");
1256 }
1257 cleanup:
1258 sdsfree(server.bgrewritebuf);
1259 server.bgrewritebuf = sdsempty();
1260 aofRemoveTempFile(server.bgrewritechildpid);
1261 server.bgrewritechildpid = -1;
1262 }
1263
1264 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1265 int j, loops = server.cronloops++;
1266 REDIS_NOTUSED(eventLoop);
1267 REDIS_NOTUSED(id);
1268 REDIS_NOTUSED(clientData);
1269
1270 /* We take a cached value of the unix time in the global state because
1271 * with virtual memory and aging there is to store the current time
1272 * in objects at every object access, and accuracy is not needed.
1273 * To access a global var is faster than calling time(NULL) */
1274 server.unixtime = time(NULL);
1275
1276 /* Show some info about non-empty databases */
1277 for (j = 0; j < server.dbnum; j++) {
1278 long long size, used, vkeys;
1279
1280 size = dictSlots(server.db[j].dict);
1281 used = dictSize(server.db[j].dict);
1282 vkeys = dictSize(server.db[j].expires);
1283 if (!(loops % 50) && (used || vkeys)) {
1284 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1285 /* dictPrintStats(server.dict); */
1286 }
1287 }
1288
1289 /* We don't want to resize the hash tables while a bacground saving
1290 * is in progress: the saving child is created using fork() that is
1291 * implemented with a copy-on-write semantic in most modern systems, so
1292 * if we resize the HT while there is the saving child at work actually
1293 * a lot of memory movements in the parent will cause a lot of pages
1294 * copied. */
1295 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1296
1297 /* Show information about connected clients */
1298 if (!(loops % 50)) {
1299 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1300 listLength(server.clients)-listLength(server.slaves),
1301 listLength(server.slaves),
1302 zmalloc_used_memory(),
1303 dictSize(server.sharingpool));
1304 }
1305
1306 /* Close connections of timedout clients */
1307 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1308 closeTimedoutClients();
1309
1310 /* Check if a background saving or AOF rewrite in progress terminated */
1311 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1312 int statloc;
1313 pid_t pid;
1314
1315 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1316 if (pid == server.bgsavechildpid) {
1317 backgroundSaveDoneHandler(statloc);
1318 } else {
1319 backgroundRewriteDoneHandler(statloc);
1320 }
1321 }
1322 } else {
1323 /* If there is not a background saving in progress check if
1324 * we have to save now */
1325 time_t now = time(NULL);
1326 for (j = 0; j < server.saveparamslen; j++) {
1327 struct saveparam *sp = server.saveparams+j;
1328
1329 if (server.dirty >= sp->changes &&
1330 now-server.lastsave > sp->seconds) {
1331 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1332 sp->changes, sp->seconds);
1333 rdbSaveBackground(server.dbfilename);
1334 break;
1335 }
1336 }
1337 }
1338
1339 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1340 * will use few CPU cycles if there are few expiring keys, otherwise
1341 * it will get more aggressive to avoid that too much memory is used by
1342 * keys that can be removed from the keyspace. */
1343 for (j = 0; j < server.dbnum; j++) {
1344 int expired;
1345 redisDb *db = server.db+j;
1346
1347 /* Continue to expire if at the end of the cycle more than 25%
1348 * of the keys were expired. */
1349 do {
1350 long num = dictSize(db->expires);
1351 time_t now = time(NULL);
1352
1353 expired = 0;
1354 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1355 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1356 while (num--) {
1357 dictEntry *de;
1358 time_t t;
1359
1360 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1361 t = (time_t) dictGetEntryVal(de);
1362 if (now > t) {
1363 deleteKey(db,dictGetEntryKey(de));
1364 expired++;
1365 server.stat_expiredkeys++;
1366 }
1367 }
1368 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1369 }
1370
1371 /* Swap a few keys on disk if we are over the memory limit and VM
1372 * is enbled. Try to free objects from the free list first. */
1373 if (vmCanSwapOut()) {
1374 while (server.vm_enabled && zmalloc_used_memory() >
1375 server.vm_max_memory)
1376 {
1377 int retval;
1378
1379 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1380 retval = (server.vm_max_threads == 0) ?
1381 vmSwapOneObjectBlocking() :
1382 vmSwapOneObjectThreaded();
1383 if (retval == REDIS_ERR && !(loops % 300) &&
1384 zmalloc_used_memory() >
1385 (server.vm_max_memory+server.vm_max_memory/10))
1386 {
1387 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1388 }
1389 /* Note that when using threade I/O we free just one object,
1390 * because anyway when the I/O thread in charge to swap this
1391 * object out will finish, the handler of completed jobs
1392 * will try to swap more objects if we are still out of memory. */
1393 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1394 }
1395 }
1396
1397 /* Check if we should connect to a MASTER */
1398 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1399 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1400 if (syncWithMaster() == REDIS_OK) {
1401 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1402 }
1403 }
1404 return 100;
1405 }
1406
1407 /* This function gets called every time Redis is entering the
1408 * main loop of the event driven library, that is, before to sleep
1409 * for ready file descriptors. */
1410 static void beforeSleep(struct aeEventLoop *eventLoop) {
1411 REDIS_NOTUSED(eventLoop);
1412
1413 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1414 listIter li;
1415 listNode *ln;
1416
1417 listRewind(server.io_ready_clients,&li);
1418 while((ln = listNext(&li))) {
1419 redisClient *c = ln->value;
1420 struct redisCommand *cmd;
1421
1422 /* Resume the client. */
1423 listDelNode(server.io_ready_clients,ln);
1424 c->flags &= (~REDIS_IO_WAIT);
1425 server.vm_blocked_clients--;
1426 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1427 readQueryFromClient, c);
1428 cmd = lookupCommand(c->argv[0]->ptr);
1429 assert(cmd != NULL);
1430 call(c,cmd);
1431 resetClient(c);
1432 /* There may be more data to process in the input buffer. */
1433 if (c->querybuf && sdslen(c->querybuf) > 0)
1434 processInputBuffer(c);
1435 }
1436 }
1437 }
1438
1439 static void createSharedObjects(void) {
1440 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1441 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1442 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1443 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1444 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1445 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1446 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1447 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1448 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1449 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1450 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1451 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1452 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1453 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1454 "-ERR no such key\r\n"));
1455 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1456 "-ERR syntax error\r\n"));
1457 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1458 "-ERR source and destination objects are the same\r\n"));
1459 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1460 "-ERR index out of range\r\n"));
1461 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1462 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1463 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1464 shared.select0 = createStringObject("select 0\r\n",10);
1465 shared.select1 = createStringObject("select 1\r\n",10);
1466 shared.select2 = createStringObject("select 2\r\n",10);
1467 shared.select3 = createStringObject("select 3\r\n",10);
1468 shared.select4 = createStringObject("select 4\r\n",10);
1469 shared.select5 = createStringObject("select 5\r\n",10);
1470 shared.select6 = createStringObject("select 6\r\n",10);
1471 shared.select7 = createStringObject("select 7\r\n",10);
1472 shared.select8 = createStringObject("select 8\r\n",10);
1473 shared.select9 = createStringObject("select 9\r\n",10);
1474 }
1475
1476 static void appendServerSaveParams(time_t seconds, int changes) {
1477 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1478 server.saveparams[server.saveparamslen].seconds = seconds;
1479 server.saveparams[server.saveparamslen].changes = changes;
1480 server.saveparamslen++;
1481 }
1482
1483 static void resetServerSaveParams() {
1484 zfree(server.saveparams);
1485 server.saveparams = NULL;
1486 server.saveparamslen = 0;
1487 }
1488
1489 static void initServerConfig() {
1490 server.dbnum = REDIS_DEFAULT_DBNUM;
1491 server.port = REDIS_SERVERPORT;
1492 server.verbosity = REDIS_VERBOSE;
1493 server.maxidletime = REDIS_MAXIDLETIME;
1494 server.saveparams = NULL;
1495 server.logfile = NULL; /* NULL = log on standard output */
1496 server.bindaddr = NULL;
1497 server.glueoutputbuf = 1;
1498 server.daemonize = 0;
1499 server.appendonly = 0;
1500 server.appendfsync = APPENDFSYNC_ALWAYS;
1501 server.lastfsync = time(NULL);
1502 server.appendfd = -1;
1503 server.appendseldb = -1; /* Make sure the first time will not match */
1504 server.pidfile = zstrdup("/var/run/redis.pid");
1505 server.dbfilename = zstrdup("dump.rdb");
1506 server.appendfilename = zstrdup("appendonly.aof");
1507 server.requirepass = NULL;
1508 server.shareobjects = 0;
1509 server.rdbcompression = 1;
1510 server.sharingpoolsize = 1024;
1511 server.maxclients = 0;
1512 server.blpop_blocked_clients = 0;
1513 server.maxmemory = 0;
1514 server.vm_enabled = 0;
1515 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1516 server.vm_page_size = 256; /* 256 bytes per page */
1517 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1518 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1519 server.vm_max_threads = 4;
1520 server.vm_blocked_clients = 0;
1521 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1522 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1523
1524 resetServerSaveParams();
1525
1526 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1527 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1528 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1529 /* Replication related */
1530 server.isslave = 0;
1531 server.masterauth = NULL;
1532 server.masterhost = NULL;
1533 server.masterport = 6379;
1534 server.master = NULL;
1535 server.replstate = REDIS_REPL_NONE;
1536
1537 /* Double constants initialization */
1538 R_Zero = 0.0;
1539 R_PosInf = 1.0/R_Zero;
1540 R_NegInf = -1.0/R_Zero;
1541 R_Nan = R_Zero/R_Zero;
1542 }
1543
1544 static void initServer() {
1545 int j;
1546
1547 signal(SIGHUP, SIG_IGN);
1548 signal(SIGPIPE, SIG_IGN);
1549 setupSigSegvAction();
1550
1551 server.devnull = fopen("/dev/null","w");
1552 if (server.devnull == NULL) {
1553 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1554 exit(1);
1555 }
1556 server.clients = listCreate();
1557 server.slaves = listCreate();
1558 server.monitors = listCreate();
1559 server.objfreelist = listCreate();
1560 createSharedObjects();
1561 server.el = aeCreateEventLoop();
1562 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1563 server.sharingpool = dictCreate(&setDictType,NULL);
1564 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1565 if (server.fd == -1) {
1566 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1567 exit(1);
1568 }
1569 for (j = 0; j < server.dbnum; j++) {
1570 server.db[j].dict = dictCreate(&dbDictType,NULL);
1571 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1572 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1573 if (server.vm_enabled)
1574 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1575 server.db[j].id = j;
1576 }
1577 server.cronloops = 0;
1578 server.bgsavechildpid = -1;
1579 server.bgrewritechildpid = -1;
1580 server.bgrewritebuf = sdsempty();
1581 server.lastsave = time(NULL);
1582 server.dirty = 0;
1583 server.stat_numcommands = 0;
1584 server.stat_numconnections = 0;
1585 server.stat_expiredkeys = 0;
1586 server.stat_starttime = time(NULL);
1587 server.unixtime = time(NULL);
1588 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1589 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1590 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1591
1592 if (server.appendonly) {
1593 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1594 if (server.appendfd == -1) {
1595 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1596 strerror(errno));
1597 exit(1);
1598 }
1599 }
1600
1601 if (server.vm_enabled) vmInit();
1602 }
1603
1604 /* Empty the whole database */
1605 static long long emptyDb() {
1606 int j;
1607 long long removed = 0;
1608
1609 for (j = 0; j < server.dbnum; j++) {
1610 removed += dictSize(server.db[j].dict);
1611 dictEmpty(server.db[j].dict);
1612 dictEmpty(server.db[j].expires);
1613 }
1614 return removed;
1615 }
1616
/* Convert a "yes" / "no" string (compared case insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1622
/* I agree, this is a very rudimentary way to load a configuration...
   It will be improved later if the config gets more complex. */
1625 static void loadServerConfig(char *filename) {
1626 FILE *fp;
1627 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1628 int linenum = 0;
1629 sds line = NULL;
1630 char *errormsg = "Fatal error, can't open config file '%s'";
1631 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1632 sprintf(errorbuf, errormsg, filename);
1633
1634 if (filename[0] == '-' && filename[1] == '\0')
1635 fp = stdin;
1636 else {
1637 if ((fp = fopen(filename,"r")) == NULL) {
1638 redisLog(REDIS_WARNING, errorbuf);
1639 exit(1);
1640 }
1641 }
1642
1643 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1644 sds *argv;
1645 int argc, j;
1646
1647 linenum++;
1648 line = sdsnew(buf);
1649 line = sdstrim(line," \t\r\n");
1650
1651 /* Skip comments and blank lines*/
1652 if (line[0] == '#' || line[0] == '\0') {
1653 sdsfree(line);
1654 continue;
1655 }
1656
1657 /* Split into arguments */
1658 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1659 sdstolower(argv[0]);
1660
1661 /* Execute config directives */
1662 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1663 server.maxidletime = atoi(argv[1]);
1664 if (server.maxidletime < 0) {
1665 err = "Invalid timeout value"; goto loaderr;
1666 }
1667 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1668 server.port = atoi(argv[1]);
1669 if (server.port < 1 || server.port > 65535) {
1670 err = "Invalid port"; goto loaderr;
1671 }
1672 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1673 server.bindaddr = zstrdup(argv[1]);
1674 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1675 int seconds = atoi(argv[1]);
1676 int changes = atoi(argv[2]);
1677 if (seconds < 1 || changes < 0) {
1678 err = "Invalid save parameters"; goto loaderr;
1679 }
1680 appendServerSaveParams(seconds,changes);
1681 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1682 if (chdir(argv[1]) == -1) {
1683 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1684 argv[1], strerror(errno));
1685 exit(1);
1686 }
1687 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1688 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1689 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1690 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1691 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1692 else {
1693 err = "Invalid log level. Must be one of debug, notice, warning";
1694 goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1697 FILE *logfp;
1698
1699 server.logfile = zstrdup(argv[1]);
1700 if (!strcasecmp(server.logfile,"stdout")) {
1701 zfree(server.logfile);
1702 server.logfile = NULL;
1703 }
1704 if (server.logfile) {
1705 /* Test if we are able to open the file. The server will not
1706 * be able to abort just for this problem later... */
1707 logfp = fopen(server.logfile,"a");
1708 if (logfp == NULL) {
1709 err = sdscatprintf(sdsempty(),
1710 "Can't open the log file: %s", strerror(errno));
1711 goto loaderr;
1712 }
1713 fclose(logfp);
1714 }
1715 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1716 server.dbnum = atoi(argv[1]);
1717 if (server.dbnum < 1) {
1718 err = "Invalid number of databases"; goto loaderr;
1719 }
1720 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1721 loadServerConfig(argv[1]);
1722 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1723 server.maxclients = atoi(argv[1]);
1724 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1725 server.maxmemory = strtoll(argv[1], NULL, 10);
1726 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1727 server.masterhost = sdsnew(argv[1]);
1728 server.masterport = atoi(argv[2]);
1729 server.replstate = REDIS_REPL_CONNECT;
1730 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1731 server.masterauth = zstrdup(argv[1]);
1732 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1733 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1734 err = "argument must be 'yes' or 'no'"; goto loaderr;
1735 }
1736 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1737 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1738 err = "argument must be 'yes' or 'no'"; goto loaderr;
1739 }
1740 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1741 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1742 err = "argument must be 'yes' or 'no'"; goto loaderr;
1743 }
1744 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1745 server.sharingpoolsize = atoi(argv[1]);
1746 if (server.sharingpoolsize < 1) {
1747 err = "invalid object sharing pool size"; goto loaderr;
1748 }
1749 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1750 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1751 err = "argument must be 'yes' or 'no'"; goto loaderr;
1752 }
1753 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1754 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1755 err = "argument must be 'yes' or 'no'"; goto loaderr;
1756 }
1757 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1758 if (!strcasecmp(argv[1],"no")) {
1759 server.appendfsync = APPENDFSYNC_NO;
1760 } else if (!strcasecmp(argv[1],"always")) {
1761 server.appendfsync = APPENDFSYNC_ALWAYS;
1762 } else if (!strcasecmp(argv[1],"everysec")) {
1763 server.appendfsync = APPENDFSYNC_EVERYSEC;
1764 } else {
1765 err = "argument must be 'no', 'always' or 'everysec'";
1766 goto loaderr;
1767 }
1768 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1769 server.requirepass = zstrdup(argv[1]);
1770 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1771 zfree(server.pidfile);
1772 server.pidfile = zstrdup(argv[1]);
1773 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1774 zfree(server.dbfilename);
1775 server.dbfilename = zstrdup(argv[1]);
1776 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1777 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1778 err = "argument must be 'yes' or 'no'"; goto loaderr;
1779 }
1780 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1781 zfree(server.vm_swap_file);
1782 server.vm_swap_file = zstrdup(argv[1]);
1783 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1784 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1785 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1786 server.vm_page_size = strtoll(argv[1], NULL, 10);
1787 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1788 server.vm_pages = strtoll(argv[1], NULL, 10);
1789 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1790 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1791 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1792 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1793 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1794 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1795 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1796 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1797 } else {
1798 err = "Bad directive or wrong number of arguments"; goto loaderr;
1799 }
1800 for (j = 0; j < argc; j++)
1801 sdsfree(argv[j]);
1802 zfree(argv);
1803 sdsfree(line);
1804 }
1805 if (fp != stdin) fclose(fp);
1806 return;
1807
1808 loaderr:
1809 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1810 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1811 fprintf(stderr, ">>> '%s'\n", line);
1812 fprintf(stderr, "%s\n", err);
1813 exit(1);
1814 }
1815
1816 static void freeClientArgv(redisClient *c) {
1817 int j;
1818
1819 for (j = 0; j < c->argc; j++)
1820 decrRefCount(c->argv[j]);
1821 for (j = 0; j < c->mbargc; j++)
1822 decrRefCount(c->mbargv[j]);
1823 c->argc = 0;
1824 c->mbargc = 0;
1825 }
1826
1827 static void freeClient(redisClient *c) {
1828 listNode *ln;
1829
1830 /* Note that if the client we are freeing is blocked into a blocking
1831 * call, we have to set querybuf to NULL *before* to call
1832 * unblockClientWaitingData() to avoid processInputBuffer() will get
1833 * called. Also it is important to remove the file events after
1834 * this, because this call adds the READABLE event. */
1835 sdsfree(c->querybuf);
1836 c->querybuf = NULL;
1837 if (c->flags & REDIS_BLOCKED)
1838 unblockClientWaitingData(c);
1839
1840 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1841 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1842 listRelease(c->reply);
1843 freeClientArgv(c);
1844 close(c->fd);
1845 /* Remove from the list of clients */
1846 ln = listSearchKey(server.clients,c);
1847 redisAssert(ln != NULL);
1848 listDelNode(server.clients,ln);
1849 /* Remove from the list of clients waiting for swapped keys */
1850 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1851 ln = listSearchKey(server.io_ready_clients,c);
1852 if (ln) {
1853 listDelNode(server.io_ready_clients,ln);
1854 server.vm_blocked_clients--;
1855 }
1856 }
1857 while (server.vm_enabled && listLength(c->io_keys)) {
1858 ln = listFirst(c->io_keys);
1859 dontWaitForSwappedKey(c,ln->value);
1860 }
1861 listRelease(c->io_keys);
1862 /* Other cleanup */
1863 if (c->flags & REDIS_SLAVE) {
1864 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1865 close(c->repldbfd);
1866 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1867 ln = listSearchKey(l,c);
1868 redisAssert(ln != NULL);
1869 listDelNode(l,ln);
1870 }
1871 if (c->flags & REDIS_MASTER) {
1872 server.master = NULL;
1873 server.replstate = REDIS_REPL_CONNECT;
1874 }
1875 zfree(c->argv);
1876 zfree(c->mbargv);
1877 freeClientMultiState(c);
1878 zfree(c);
1879 }
1880
1881 #define GLUEREPLY_UP_TO (1024)
1882 static void glueReplyBuffersIfNeeded(redisClient *c) {
1883 int copylen = 0;
1884 char buf[GLUEREPLY_UP_TO];
1885 listNode *ln;
1886 listIter li;
1887 robj *o;
1888
1889 listRewind(c->reply,&li);
1890 while((ln = listNext(&li))) {
1891 int objlen;
1892
1893 o = ln->value;
1894 objlen = sdslen(o->ptr);
1895 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1896 memcpy(buf+copylen,o->ptr,objlen);
1897 copylen += objlen;
1898 listDelNode(c->reply,ln);
1899 } else {
1900 if (copylen == 0) return;
1901 break;
1902 }
1903 }
1904 /* Now the output buffer is empty, add the new single element */
1905 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1906 listAddNodeHead(c->reply,o);
1907 }
1908
1909 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1910 redisClient *c = privdata;
1911 int nwritten = 0, totwritten = 0, objlen;
1912 robj *o;
1913 REDIS_NOTUSED(el);
1914 REDIS_NOTUSED(mask);
1915
1916 /* Use writev() if we have enough buffers to send */
1917 if (!server.glueoutputbuf &&
1918 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1919 !(c->flags & REDIS_MASTER))
1920 {
1921 sendReplyToClientWritev(el, fd, privdata, mask);
1922 return;
1923 }
1924
1925 while(listLength(c->reply)) {
1926 if (server.glueoutputbuf && listLength(c->reply) > 1)
1927 glueReplyBuffersIfNeeded(c);
1928
1929 o = listNodeValue(listFirst(c->reply));
1930 objlen = sdslen(o->ptr);
1931
1932 if (objlen == 0) {
1933 listDelNode(c->reply,listFirst(c->reply));
1934 continue;
1935 }
1936
1937 if (c->flags & REDIS_MASTER) {
1938 /* Don't reply to a master */
1939 nwritten = objlen - c->sentlen;
1940 } else {
1941 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1942 if (nwritten <= 0) break;
1943 }
1944 c->sentlen += nwritten;
1945 totwritten += nwritten;
1946 /* If we fully sent the object on head go to the next one */
1947 if (c->sentlen == objlen) {
1948 listDelNode(c->reply,listFirst(c->reply));
1949 c->sentlen = 0;
1950 }
1951 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1952 * bytes, in a single threaded server it's a good idea to serve
1953 * other clients as well, even if a very large request comes from
1954 * super fast link that is always able to accept data (in real world
1955 * scenario think about 'KEYS *' against the loopback interfae) */
1956 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1957 }
1958 if (nwritten == -1) {
1959 if (errno == EAGAIN) {
1960 nwritten = 0;
1961 } else {
1962 redisLog(REDIS_VERBOSE,
1963 "Error writing to client: %s", strerror(errno));
1964 freeClient(c);
1965 return;
1966 }
1967 }
1968 if (totwritten > 0) c->lastinteraction = time(NULL);
1969 if (listLength(c->reply) == 0) {
1970 c->sentlen = 0;
1971 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1972 }
1973 }
1974
1975 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1976 {
1977 redisClient *c = privdata;
1978 int nwritten = 0, totwritten = 0, objlen, willwrite;
1979 robj *o;
1980 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1981 int offset, ion = 0;
1982 REDIS_NOTUSED(el);
1983 REDIS_NOTUSED(mask);
1984
1985 listNode *node;
1986 while (listLength(c->reply)) {
1987 offset = c->sentlen;
1988 ion = 0;
1989 willwrite = 0;
1990
1991 /* fill-in the iov[] array */
1992 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1993 o = listNodeValue(node);
1994 objlen = sdslen(o->ptr);
1995
1996 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1997 break;
1998
1999 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2000 break; /* no more iovecs */
2001
2002 iov[ion].iov_base = ((char*)o->ptr) + offset;
2003 iov[ion].iov_len = objlen - offset;
2004 willwrite += objlen - offset;
2005 offset = 0; /* just for the first item */
2006 ion++;
2007 }
2008
2009 if(willwrite == 0)
2010 break;
2011
2012 /* write all collected blocks at once */
2013 if((nwritten = writev(fd, iov, ion)) < 0) {
2014 if (errno != EAGAIN) {
2015 redisLog(REDIS_VERBOSE,
2016 "Error writing to client: %s", strerror(errno));
2017 freeClient(c);
2018 return;
2019 }
2020 break;
2021 }
2022
2023 totwritten += nwritten;
2024 offset = c->sentlen;
2025
2026 /* remove written robjs from c->reply */
2027 while (nwritten && listLength(c->reply)) {
2028 o = listNodeValue(listFirst(c->reply));
2029 objlen = sdslen(o->ptr);
2030
2031 if(nwritten >= objlen - offset) {
2032 listDelNode(c->reply, listFirst(c->reply));
2033 nwritten -= objlen - offset;
2034 c->sentlen = 0;
2035 } else {
2036 /* partial write */
2037 c->sentlen += nwritten;
2038 break;
2039 }
2040 offset = 0;
2041 }
2042 }
2043
2044 if (totwritten > 0)
2045 c->lastinteraction = time(NULL);
2046
2047 if (listLength(c->reply) == 0) {
2048 c->sentlen = 0;
2049 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2050 }
2051 }
2052
2053 static struct redisCommand *lookupCommand(char *name) {
2054 int j = 0;
2055 while(cmdTable[j].name != NULL) {
2056 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2057 j++;
2058 }
2059 return NULL;
2060 }
2061
2062 /* resetClient prepare the client to process the next command */
2063 static void resetClient(redisClient *c) {
2064 freeClientArgv(c);
2065 c->bulklen = -1;
2066 c->multibulk = 0;
2067 }
2068
2069 /* Call() is the core of Redis execution of a command */
2070 static void call(redisClient *c, struct redisCommand *cmd) {
2071 long long dirty;
2072
2073 dirty = server.dirty;
2074 cmd->proc(c);
2075 if (server.appendonly && server.dirty-dirty)
2076 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2077 if (server.dirty-dirty && listLength(server.slaves))
2078 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2079 if (listLength(server.monitors))
2080 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2081 server.stat_numcommands++;
2082 }
2083
2084 /* If this function gets called we already read a whole
2085 * command, argments are in the client argv/argc fields.
2086 * processCommand() execute the command or prepare the
2087 * server for a bulk read from the client.
2088 *
2089 * If 1 is returned the client is still alive and valid and
2090 * and other operations can be performed by the caller. Otherwise
2091 * if 0 is returned the client was destroied (i.e. after QUIT). */
2092 static int processCommand(redisClient *c) {
2093 struct redisCommand *cmd;
2094
2095 /* Free some memory if needed (maxmemory setting) */
2096 if (server.maxmemory) freeMemoryIfNeeded();
2097
2098 /* Handle the multi bulk command type. This is an alternative protocol
2099 * supported by Redis in order to receive commands that are composed of
2100 * multiple binary-safe "bulk" arguments. The latency of processing is
2101 * a bit higher but this allows things like multi-sets, so if this
2102 * protocol is used only for MSET and similar commands this is a big win. */
2103 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2104 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2105 if (c->multibulk <= 0) {
2106 resetClient(c);
2107 return 1;
2108 } else {
2109 decrRefCount(c->argv[c->argc-1]);
2110 c->argc--;
2111 return 1;
2112 }
2113 } else if (c->multibulk) {
2114 if (c->bulklen == -1) {
2115 if (((char*)c->argv[0]->ptr)[0] != '$') {
2116 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2117 resetClient(c);
2118 return 1;
2119 } else {
2120 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2121 decrRefCount(c->argv[0]);
2122 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2123 c->argc--;
2124 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2125 resetClient(c);
2126 return 1;
2127 }
2128 c->argc--;
2129 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2130 return 1;
2131 }
2132 } else {
2133 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2134 c->mbargv[c->mbargc] = c->argv[0];
2135 c->mbargc++;
2136 c->argc--;
2137 c->multibulk--;
2138 if (c->multibulk == 0) {
2139 robj **auxargv;
2140 int auxargc;
2141
2142 /* Here we need to swap the multi-bulk argc/argv with the
2143 * normal argc/argv of the client structure. */
2144 auxargv = c->argv;
2145 c->argv = c->mbargv;
2146 c->mbargv = auxargv;
2147
2148 auxargc = c->argc;
2149 c->argc = c->mbargc;
2150 c->mbargc = auxargc;
2151
2152 /* We need to set bulklen to something different than -1
2153 * in order for the code below to process the command without
2154 * to try to read the last argument of a bulk command as
2155 * a special argument. */
2156 c->bulklen = 0;
2157 /* continue below and process the command */
2158 } else {
2159 c->bulklen = -1;
2160 return 1;
2161 }
2162 }
2163 }
2164 /* -- end of multi bulk commands processing -- */
2165
2166 /* The QUIT command is handled as a special case. Normal command
2167 * procs are unable to close the client connection safely */
2168 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2169 freeClient(c);
2170 return 0;
2171 }
2172
2173 /* Now lookup the command and check ASAP about trivial error conditions
2174 * such wrong arity, bad command name and so forth. */
2175 cmd = lookupCommand(c->argv[0]->ptr);
2176 if (!cmd) {
2177 addReplySds(c,
2178 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2179 (char*)c->argv[0]->ptr));
2180 resetClient(c);
2181 return 1;
2182 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2183 (c->argc < -cmd->arity)) {
2184 addReplySds(c,
2185 sdscatprintf(sdsempty(),
2186 "-ERR wrong number of arguments for '%s' command\r\n",
2187 cmd->name));
2188 resetClient(c);
2189 return 1;
2190 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2191 /* This is a bulk command, we have to read the last argument yet. */
2192 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2193
2194 decrRefCount(c->argv[c->argc-1]);
2195 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2196 c->argc--;
2197 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2198 resetClient(c);
2199 return 1;
2200 }
2201 c->argc--;
2202 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2203 /* It is possible that the bulk read is already in the
2204 * buffer. Check this condition and handle it accordingly.
2205 * This is just a fast path, alternative to call processInputBuffer().
2206 * It's a good idea since the code is small and this condition
2207 * happens most of the times. */
2208 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2209 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2210 c->argc++;
2211 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2212 } else {
2213 /* Otherwise return... there is to read the last argument
2214 * from the socket. */
2215 return 1;
2216 }
2217 }
2218 /* Let's try to share objects on the command arguments vector */
2219 if (server.shareobjects) {
2220 int j;
2221 for(j = 1; j < c->argc; j++)
2222 c->argv[j] = tryObjectSharing(c->argv[j]);
2223 }
2224 /* Let's try to encode the bulk object to save space. */
2225 if (cmd->flags & REDIS_CMD_BULK)
2226 tryObjectEncoding(c->argv[c->argc-1]);
2227
2228 /* Check if the user is authenticated */
2229 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2230 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2231 resetClient(c);
2232 return 1;
2233 }
2234
2235 /* Handle the maxmemory directive */
2236 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2237 zmalloc_used_memory() > server.maxmemory)
2238 {
2239 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2240 resetClient(c);
2241 return 1;
2242 }
2243
2244 /* Exec the command */
2245 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2246 queueMultiCommand(c,cmd);
2247 addReply(c,shared.queued);
2248 } else {
2249 if (server.vm_enabled && server.vm_max_threads > 0 &&
2250 blockClientOnSwappedKeys(cmd,c)) return 1;
2251 call(c,cmd);
2252 }
2253
2254 /* Prepare the client for the next command */
2255 resetClient(c);
2256 return 1;
2257 }
2258
2259 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2260 listNode *ln;
2261 listIter li;
2262 int outc = 0, j;
2263 robj **outv;
2264 /* (args*2)+1 is enough room for args, spaces, newlines */
2265 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2266
2267 if (argc <= REDIS_STATIC_ARGS) {
2268 outv = static_outv;
2269 } else {
2270 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2271 }
2272
2273 for (j = 0; j < argc; j++) {
2274 if (j != 0) outv[outc++] = shared.space;
2275 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2276 robj *lenobj;
2277
2278 lenobj = createObject(REDIS_STRING,
2279 sdscatprintf(sdsempty(),"%lu\r\n",
2280 (unsigned long) stringObjectLen(argv[j])));
2281 lenobj->refcount = 0;
2282 outv[outc++] = lenobj;
2283 }
2284 outv[outc++] = argv[j];
2285 }
2286 outv[outc++] = shared.crlf;
2287
2288 /* Increment all the refcounts at start and decrement at end in order to
2289 * be sure to free objects if there is no slave in a replication state
2290 * able to be feed with commands */
2291 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2292 listRewind(slaves,&li);
2293 while((ln = listNext(&li))) {
2294 redisClient *slave = ln->value;
2295
2296 /* Don't feed slaves that are still waiting for BGSAVE to start */
2297 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2298
2299 /* Feed all the other slaves, MONITORs and so on */
2300 if (slave->slaveseldb != dictid) {
2301 robj *selectcmd;
2302
2303 switch(dictid) {
2304 case 0: selectcmd = shared.select0; break;
2305 case 1: selectcmd = shared.select1; break;
2306 case 2: selectcmd = shared.select2; break;
2307 case 3: selectcmd = shared.select3; break;
2308 case 4: selectcmd = shared.select4; break;
2309 case 5: selectcmd = shared.select5; break;
2310 case 6: selectcmd = shared.select6; break;
2311 case 7: selectcmd = shared.select7; break;
2312 case 8: selectcmd = shared.select8; break;
2313 case 9: selectcmd = shared.select9; break;
2314 default:
2315 selectcmd = createObject(REDIS_STRING,
2316 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2317 selectcmd->refcount = 0;
2318 break;
2319 }
2320 addReply(slave,selectcmd);
2321 slave->slaveseldb = dictid;
2322 }
2323 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2324 }
2325 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2326 if (outv != static_outv) zfree(outv);
2327 }
2328
2329 static void processInputBuffer(redisClient *c) {
2330 again:
2331 /* Before to process the input buffer, make sure the client is not
2332 * waitig for a blocking operation such as BLPOP. Note that the first
2333 * iteration the client is never blocked, otherwise the processInputBuffer
2334 * would not be called at all, but after the execution of the first commands
2335 * in the input buffer the client may be blocked, and the "goto again"
2336 * will try to reiterate. The following line will make it return asap. */
2337 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2338 if (c->bulklen == -1) {
2339 /* Read the first line of the query */
2340 char *p = strchr(c->querybuf,'\n');
2341 size_t querylen;
2342
2343 if (p) {
2344 sds query, *argv;
2345 int argc, j;
2346
2347 query = c->querybuf;
2348 c->querybuf = sdsempty();
2349 querylen = 1+(p-(query));
2350 if (sdslen(query) > querylen) {
2351 /* leave data after the first line of the query in the buffer */
2352 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2353 }
2354 *p = '\0'; /* remove "\n" */
2355 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2356 sdsupdatelen(query);
2357
2358 /* Now we can split the query in arguments */
2359 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2360 sdsfree(query);
2361
2362 if (c->argv) zfree(c->argv);
2363 c->argv = zmalloc(sizeof(robj*)*argc);
2364
2365 for (j = 0; j < argc; j++) {
2366 if (sdslen(argv[j])) {
2367 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2368 c->argc++;
2369 } else {
2370 sdsfree(argv[j]);
2371 }
2372 }
2373 zfree(argv);
2374 if (c->argc) {
2375 /* Execute the command. If the client is still valid
2376 * after processCommand() return and there is something
2377 * on the query buffer try to process the next command. */
2378 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2379 } else {
2380 /* Nothing to process, argc == 0. Just process the query
2381 * buffer if it's not empty or return to the caller */
2382 if (sdslen(c->querybuf)) goto again;
2383 }
2384 return;
2385 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2386 redisLog(REDIS_VERBOSE, "Client protocol error");
2387 freeClient(c);
2388 return;
2389 }
2390 } else {
2391 /* Bulk read handling. Note that if we are at this point
2392 the client already sent a command terminated with a newline,
2393 we are reading the bulk data that is actually the last
2394 argument of the command. */
2395 int qbl = sdslen(c->querybuf);
2396
2397 if (c->bulklen <= qbl) {
2398 /* Copy everything but the final CRLF as final argument */
2399 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2400 c->argc++;
2401 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2402 /* Process the command. If the client is still valid after
2403 * the processing and there is more data in the buffer
2404 * try to parse it. */
2405 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2406 return;
2407 }
2408 }
2409 }
2410
2411 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2412 redisClient *c = (redisClient*) privdata;
2413 char buf[REDIS_IOBUF_LEN];
2414 int nread;
2415 REDIS_NOTUSED(el);
2416 REDIS_NOTUSED(mask);
2417
2418 nread = read(fd, buf, REDIS_IOBUF_LEN);
2419 if (nread == -1) {
2420 if (errno == EAGAIN) {
2421 nread = 0;
2422 } else {
2423 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2424 freeClient(c);
2425 return;
2426 }
2427 } else if (nread == 0) {
2428 redisLog(REDIS_VERBOSE, "Client closed connection");
2429 freeClient(c);
2430 return;
2431 }
2432 if (nread) {
2433 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2434 c->lastinteraction = time(NULL);
2435 } else {
2436 return;
2437 }
2438 if (!(c->flags & REDIS_BLOCKED))
2439 processInputBuffer(c);
2440 }
2441
2442 static int selectDb(redisClient *c, int id) {
2443 if (id < 0 || id >= server.dbnum)
2444 return REDIS_ERR;
2445 c->db = &server.db[id];
2446 return REDIS_OK;
2447 }
2448
2449 static void *dupClientReplyValue(void *o) {
2450 incrRefCount((robj*)o);
2451 return o;
2452 }
2453
2454 static redisClient *createClient(int fd) {
2455 redisClient *c = zmalloc(sizeof(*c));
2456
2457 anetNonBlock(NULL,fd);
2458 anetTcpNoDelay(NULL,fd);
2459 if (!c) return NULL;
2460 selectDb(c,0);
2461 c->fd = fd;
2462 c->querybuf = sdsempty();
2463 c->argc = 0;
2464 c->argv = NULL;
2465 c->bulklen = -1;
2466 c->multibulk = 0;
2467 c->mbargc = 0;
2468 c->mbargv = NULL;
2469 c->sentlen = 0;
2470 c->flags = 0;
2471 c->lastinteraction = time(NULL);
2472 c->authenticated = 0;
2473 c->replstate = REDIS_REPL_NONE;
2474 c->reply = listCreate();
2475 listSetFreeMethod(c->reply,decrRefCount);
2476 listSetDupMethod(c->reply,dupClientReplyValue);
2477 c->blockingkeys = NULL;
2478 c->blockingkeysnum = 0;
2479 c->io_keys = listCreate();
2480 listSetFreeMethod(c->io_keys,decrRefCount);
2481 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2482 readQueryFromClient, c) == AE_ERR) {
2483 freeClient(c);
2484 return NULL;
2485 }
2486 listAddNodeTail(server.clients,c);
2487 initClientMultiState(c);
2488 return c;
2489 }
2490
2491 static void addReply(redisClient *c, robj *obj) {
2492 if (listLength(c->reply) == 0 &&
2493 (c->replstate == REDIS_REPL_NONE ||
2494 c->replstate == REDIS_REPL_ONLINE) &&
2495 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2496 sendReplyToClient, c) == AE_ERR) return;
2497
2498 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2499 obj = dupStringObject(obj);
2500 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2501 }
2502 listAddNodeTail(c->reply,getDecodedObject(obj));
2503 }
2504
2505 static void addReplySds(redisClient *c, sds s) {
2506 robj *o = createObject(REDIS_STRING,s);
2507 addReply(c,o);
2508 decrRefCount(o);
2509 }
2510
2511 static void addReplyDouble(redisClient *c, double d) {
2512 char buf[128];
2513
2514 snprintf(buf,sizeof(buf),"%.17g",d);
2515 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2516 (unsigned long) strlen(buf),buf));
2517 }
2518
2519 static void addReplyLong(redisClient *c, long l) {
2520 char buf[128];
2521 size_t len;
2522
2523 if (l == 0) {
2524 addReply(c,shared.czero);
2525 return;
2526 } else if (l == 1) {
2527 addReply(c,shared.cone);
2528 return;
2529 }
2530 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2531 addReplySds(c,sdsnewlen(buf,len));
2532 }
2533
2534 static void addReplyUlong(redisClient *c, unsigned long ul) {
2535 char buf[128];
2536 size_t len;
2537
2538 if (ul == 0) {
2539 addReply(c,shared.czero);
2540 return;
2541 } else if (ul == 1) {
2542 addReply(c,shared.cone);
2543 return;
2544 }
2545 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2546 addReplySds(c,sdsnewlen(buf,len));
2547 }
2548
2549 static void addReplyBulkLen(redisClient *c, robj *obj) {
2550 size_t len;
2551
2552 if (obj->encoding == REDIS_ENCODING_RAW) {
2553 len = sdslen(obj->ptr);
2554 } else {
2555 long n = (long)obj->ptr;
2556
2557 /* Compute how many bytes will take this integer as a radix 10 string */
2558 len = 1;
2559 if (n < 0) {
2560 len++;
2561 n = -n;
2562 }
2563 while((n = n/10) != 0) {
2564 len++;
2565 }
2566 }
2567 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2568 }
2569
2570 static void addReplyBulk(redisClient *c, robj *obj) {
2571 addReplyBulkLen(c,obj);
2572 addReply(c,obj);
2573 addReply(c,shared.crlf);
2574 }
2575
2576 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2577 static void addReplyBulkCString(redisClient *c, char *s) {
2578 if (s == NULL) {
2579 addReply(c,shared.nullbulk);
2580 } else {
2581 robj *o = createStringObject(s,strlen(s));
2582 addReplyBulk(c,o);
2583 decrRefCount(o);
2584 }
2585 }
2586
2587 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2588 int cport, cfd;
2589 char cip[128];
2590 redisClient *c;
2591 REDIS_NOTUSED(el);
2592 REDIS_NOTUSED(mask);
2593 REDIS_NOTUSED(privdata);
2594
2595 cfd = anetAccept(server.neterr, fd, cip, &cport);
2596 if (cfd == AE_ERR) {
2597 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2598 return;
2599 }
2600 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2601 if ((c = createClient(cfd)) == NULL) {
2602 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2603 close(cfd); /* May be already closed, just ingore errors */
2604 return;
2605 }
2606 /* If maxclient directive is set and this is one client more... close the
2607 * connection. Note that we create the client instead to check before
2608 * for this condition, since now the socket is already set in nonblocking
2609 * mode and we can send an error for free using the Kernel I/O */
2610 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2611 char *err = "-ERR max number of clients reached\r\n";
2612
2613 /* That's a best effort error message, don't check write errors */
2614 if (write(c->fd,err,strlen(err)) == -1) {
2615 /* Nothing to do, Just to avoid the warning... */
2616 }
2617 freeClient(c);
2618 return;
2619 }
2620 server.stat_numconnections++;
2621 }
2622
2623 /* ======================= Redis objects implementation ===================== */
2624
2625 static robj *createObject(int type, void *ptr) {
2626 robj *o;
2627
2628 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2629 if (listLength(server.objfreelist)) {
2630 listNode *head = listFirst(server.objfreelist);
2631 o = listNodeValue(head);
2632 listDelNode(server.objfreelist,head);
2633 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2634 } else {
2635 if (server.vm_enabled) {
2636 pthread_mutex_unlock(&server.obj_freelist_mutex);
2637 o = zmalloc(sizeof(*o));
2638 } else {
2639 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2640 }
2641 }
2642 o->type = type;
2643 o->encoding = REDIS_ENCODING_RAW;
2644 o->ptr = ptr;
2645 o->refcount = 1;
2646 if (server.vm_enabled) {
2647 /* Note that this code may run in the context of an I/O thread
2648 * and accessing to server.unixtime in theory is an error
2649 * (no locks). But in practice this is safe, and even if we read
2650 * garbage Redis will not fail, as it's just a statistical info */
2651 o->vm.atime = server.unixtime;
2652 o->storage = REDIS_VM_MEMORY;
2653 }
2654 return o;
2655 }
2656
2657 static robj *createStringObject(char *ptr, size_t len) {
2658 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2659 }
2660
2661 static robj *dupStringObject(robj *o) {
2662 assert(o->encoding == REDIS_ENCODING_RAW);
2663 return createStringObject(o->ptr,sdslen(o->ptr));
2664 }
2665
2666 static robj *createListObject(void) {
2667 list *l = listCreate();
2668
2669 listSetFreeMethod(l,decrRefCount);
2670 return createObject(REDIS_LIST,l);
2671 }
2672
2673 static robj *createSetObject(void) {
2674 dict *d = dictCreate(&setDictType,NULL);
2675 return createObject(REDIS_SET,d);
2676 }
2677
2678 static robj *createHashObject(void) {
2679 /* All the Hashes start as zipmaps. Will be automatically converted
2680 * into hash tables if there are enough elements or big elements
2681 * inside. */
2682 unsigned char *zm = zipmapNew();
2683 robj *o = createObject(REDIS_HASH,zm);
2684 o->encoding = REDIS_ENCODING_ZIPMAP;
2685 return o;
2686 }
2687
2688 static robj *createZsetObject(void) {
2689 zset *zs = zmalloc(sizeof(*zs));
2690
2691 zs->dict = dictCreate(&zsetDictType,NULL);
2692 zs->zsl = zslCreate();
2693 return createObject(REDIS_ZSET,zs);
2694 }
2695
2696 static void freeStringObject(robj *o) {
2697 if (o->encoding == REDIS_ENCODING_RAW) {
2698 sdsfree(o->ptr);
2699 }
2700 }
2701
2702 static void freeListObject(robj *o) {
2703 listRelease((list*) o->ptr);
2704 }
2705
2706 static void freeSetObject(robj *o) {
2707 dictRelease((dict*) o->ptr);
2708 }
2709
2710 static void freeZsetObject(robj *o) {
2711 zset *zs = o->ptr;
2712
2713 dictRelease(zs->dict);
2714 zslFree(zs->zsl);
2715 zfree(zs);
2716 }
2717
2718 static void freeHashObject(robj *o) {
2719 switch (o->encoding) {
2720 case REDIS_ENCODING_HT:
2721 dictRelease((dict*) o->ptr);
2722 break;
2723 case REDIS_ENCODING_ZIPMAP:
2724 zfree(o->ptr);
2725 break;
2726 default:
2727 redisAssert(0);
2728 break;
2729 }
2730 }
2731
2732 static void incrRefCount(robj *o) {
2733 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2734 o->refcount++;
2735 }
2736
2737 static void decrRefCount(void *obj) {
2738 robj *o = obj;
2739
2740 /* Object is a key of a swapped out value, or in the process of being
2741 * loaded. */
2742 if (server.vm_enabled &&
2743 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2744 {
2745 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2746 redisAssert(o->refcount == 1);
2747 }
2748 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2749 redisAssert(o->type == REDIS_STRING);
2750 freeStringObject(o);
2751 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2752 pthread_mutex_lock(&server.obj_freelist_mutex);
2753 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2754 !listAddNodeHead(server.objfreelist,o))
2755 zfree(o);
2756 pthread_mutex_unlock(&server.obj_freelist_mutex);
2757 server.vm_stats_swapped_objects--;
2758 return;
2759 }
2760 /* Object is in memory, or in the process of being swapped out. */
2761 if (--(o->refcount) == 0) {
2762 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2763 vmCancelThreadedIOJob(obj);
2764 switch(o->type) {
2765 case REDIS_STRING: freeStringObject(o); break;
2766 case REDIS_LIST: freeListObject(o); break;
2767 case REDIS_SET: freeSetObject(o); break;
2768 case REDIS_ZSET: freeZsetObject(o); break;
2769 case REDIS_HASH: freeHashObject(o); break;
2770 default: redisAssert(0); break;
2771 }
2772 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2773 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2774 !listAddNodeHead(server.objfreelist,o))
2775 zfree(o);
2776 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2777 }
2778 }
2779
2780 static robj *lookupKey(redisDb *db, robj *key) {
2781 dictEntry *de = dictFind(db->dict,key);
2782 if (de) {
2783 robj *key = dictGetEntryKey(de);
2784 robj *val = dictGetEntryVal(de);
2785
2786 if (server.vm_enabled) {
2787 if (key->storage == REDIS_VM_MEMORY ||
2788 key->storage == REDIS_VM_SWAPPING)
2789 {
2790 /* If we were swapping the object out, stop it, this key
2791 * was requested. */
2792 if (key->storage == REDIS_VM_SWAPPING)
2793 vmCancelThreadedIOJob(key);
2794 /* Update the access time of the key for the aging algorithm. */
2795 key->vm.atime = server.unixtime;
2796 } else {
2797 int notify = (key->storage == REDIS_VM_LOADING);
2798
2799 /* Our value was swapped on disk. Bring it at home. */
2800 redisAssert(val == NULL);
2801 val = vmLoadObject(key);
2802 dictGetEntryVal(de) = val;
2803
2804 /* Clients blocked by the VM subsystem may be waiting for
2805 * this key... */
2806 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2807 }
2808 }
2809 return val;
2810 } else {
2811 return NULL;
2812 }
2813 }
2814
2815 static robj *lookupKeyRead(redisDb *db, robj *key) {
2816 expireIfNeeded(db,key);
2817 return lookupKey(db,key);
2818 }
2819
2820 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2821 deleteIfVolatile(db,key);
2822 return lookupKey(db,key);
2823 }
2824
2825 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2826 robj *o = lookupKeyRead(c->db, key);
2827 if (!o) addReply(c,reply);
2828 return o;
2829 }
2830
2831 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2832 robj *o = lookupKeyWrite(c->db, key);
2833 if (!o) addReply(c,reply);
2834 return o;
2835 }
2836
2837 static int checkType(redisClient *c, robj *o, int type) {
2838 if (o->type != type) {
2839 addReply(c,shared.wrongtypeerr);
2840 return 1;
2841 }
2842 return 0;
2843 }
2844
2845 static int deleteKey(redisDb *db, robj *key) {
2846 int retval;
2847
2848 /* We need to protect key from destruction: after the first dictDelete()
2849 * it may happen that 'key' is no longer valid if we don't increment
2850 * it's count. This may happen when we get the object reference directly
2851 * from the hash table with dictRandomKey() or dict iterators */
2852 incrRefCount(key);
2853 if (dictSize(db->expires)) dictDelete(db->expires,key);
2854 retval = dictDelete(db->dict,key);
2855 decrRefCount(key);
2856
2857 return retval == DICT_OK;
2858 }
2859
2860 /* Try to share an object against the shared objects pool */
2861 static robj *tryObjectSharing(robj *o) {
2862 struct dictEntry *de;
2863 unsigned long c;
2864
2865 if (o == NULL || server.shareobjects == 0) return o;
2866
2867 redisAssert(o->type == REDIS_STRING);
2868 de = dictFind(server.sharingpool,o);
2869 if (de) {
2870 robj *shared = dictGetEntryKey(de);
2871
2872 c = ((unsigned long) dictGetEntryVal(de))+1;
2873 dictGetEntryVal(de) = (void*) c;
2874 incrRefCount(shared);
2875 decrRefCount(o);
2876 return shared;
2877 } else {
2878 /* Here we are using a stream algorihtm: Every time an object is
2879 * shared we increment its count, everytime there is a miss we
2880 * recrement the counter of a random object. If this object reaches
2881 * zero we remove the object and put the current object instead. */
2882 if (dictSize(server.sharingpool) >=
2883 server.sharingpoolsize) {
2884 de = dictGetRandomKey(server.sharingpool);
2885 redisAssert(de != NULL);
2886 c = ((unsigned long) dictGetEntryVal(de))-1;
2887 dictGetEntryVal(de) = (void*) c;
2888 if (c == 0) {
2889 dictDelete(server.sharingpool,de->key);
2890 }
2891 } else {
2892 c = 0; /* If the pool is empty we want to add this object */
2893 }
2894 if (c == 0) {
2895 int retval;
2896
2897 retval = dictAdd(server.sharingpool,o,(void*)1);
2898 redisAssert(retval == DICT_OK);
2899 incrRefCount(o);
2900 }
2901 return o;
2902 }
2903 }
2904
2905 /* Check if the nul-terminated string 's' can be represented by a long
2906 * (that is, is a number that fits into long without any other space or
2907 * character before or after the digits).
2908 *
2909 * If so, the function returns REDIS_OK and *longval is set to the value
2910 * of the number. Otherwise REDIS_ERR is returned */
2911 static int isStringRepresentableAsLong(sds s, long *longval) {
2912 char buf[32], *endptr;
2913 long value;
2914 int slen;
2915
2916 value = strtol(s, &endptr, 10);
2917 if (endptr[0] != '\0') return REDIS_ERR;
2918 slen = snprintf(buf,32,"%ld",value);
2919
2920 /* If the number converted back into a string is not identical
2921 * then it's not possible to encode the string as integer */
2922 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2923 if (longval) *longval = value;
2924 return REDIS_OK;
2925 }
2926
2927 /* Try to encode a string object in order to save space */
2928 static int tryObjectEncoding(robj *o) {
2929 long value;
2930 sds s = o->ptr;
2931
2932 if (o->encoding != REDIS_ENCODING_RAW)
2933 return REDIS_ERR; /* Already encoded */
2934
2935 /* It's not save to encode shared objects: shared objects can be shared
2936 * everywhere in the "object space" of Redis. Encoded objects can only
2937 * appear as "values" (and not, for instance, as keys) */
2938 if (o->refcount > 1) return REDIS_ERR;
2939
2940 /* Currently we try to encode only strings */
2941 redisAssert(o->type == REDIS_STRING);
2942
2943 /* Check if we can represent this string as a long integer */
2944 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2945
2946 /* Ok, this object can be encoded */
2947 o->encoding = REDIS_ENCODING_INT;
2948 sdsfree(o->ptr);
2949 o->ptr = (void*) value;
2950 return REDIS_OK;
2951 }
2952
2953 /* Get a decoded version of an encoded object (returned as a new object).
2954 * If the object is already raw-encoded just increment the ref count. */
2955 static robj *getDecodedObject(robj *o) {
2956 robj *dec;
2957
2958 if (o->encoding == REDIS_ENCODING_RAW) {
2959 incrRefCount(o);
2960 return o;
2961 }
2962 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2963 char buf[32];
2964
2965 snprintf(buf,32,"%ld",(long)o->ptr);
2966 dec = createStringObject(buf,strlen(buf));
2967 return dec;
2968 } else {
2969 redisAssert(1 != 1);
2970 }
2971 }
2972
2973 /* Compare two string objects via strcmp() or alike.
2974 * Note that the objects may be integer-encoded. In such a case we
2975 * use snprintf() to get a string representation of the numbers on the stack
2976 * and compare the strings, it's much faster than calling getDecodedObject().
2977 *
2978 * Important note: if objects are not integer encoded, but binary-safe strings,
2979 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2980 * binary safe. */
2981 static int compareStringObjects(robj *a, robj *b) {
2982 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2983 char bufa[128], bufb[128], *astr, *bstr;
2984 int bothsds = 1;
2985
2986 if (a == b) return 0;
2987 if (a->encoding != REDIS_ENCODING_RAW) {
2988 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2989 astr = bufa;
2990 bothsds = 0;
2991 } else {
2992 astr = a->ptr;
2993 }
2994 if (b->encoding != REDIS_ENCODING_RAW) {
2995 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2996 bstr = bufb;
2997 bothsds = 0;
2998 } else {
2999 bstr = b->ptr;
3000 }
3001 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3002 }
3003
3004 static size_t stringObjectLen(robj *o) {
3005 redisAssert(o->type == REDIS_STRING);
3006 if (o->encoding == REDIS_ENCODING_RAW) {
3007 return sdslen(o->ptr);
3008 } else {
3009 char buf[32];
3010
3011 return snprintf(buf,32,"%ld",(long)o->ptr);
3012 }
3013 }
3014
3015 /*============================ RDB saving/loading =========================== */
3016
/* Write a single type/opcode byte to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3021
/* Write a time value to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3027
3028 /* check rdbLoadLen() comments for more info */
3029 static int rdbSaveLen(FILE *fp, uint32_t len) {
3030 unsigned char buf[2];
3031
3032 if (len < (1<<6)) {
3033 /* Save a 6 bit len */
3034 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3035 if (fwrite(buf,1,1,fp) == 0) return -1;
3036 } else if (len < (1<<14)) {
3037 /* Save a 14 bit len */
3038 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3039 buf[1] = len&0xFF;
3040 if (fwrite(buf,2,1,fp) == 0) return -1;
3041 } else {
3042 /* Save a 32 bit len */
3043 buf[0] = (REDIS_RDB_32BITLEN<<6);
3044 if (fwrite(buf,1,1,fp) == 0) return -1;
3045 len = htonl(len);
3046 if (fwrite(&len,4,1,fp) == 0) return -1;
3047 }
3048 return 0;
3049 }
3050
3051 /* String objects in the form "2391" "-100" without any space and with a
3052 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3053 * encoded as integers to save space */
3054 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3055 long long value;
3056 char *endptr, buf[32];
3057
3058 /* Check if it's possible to encode this value as a number */
3059 value = strtoll(s, &endptr, 10);
3060 if (endptr[0] != '\0') return 0;
3061 snprintf(buf,32,"%lld",value);
3062
3063 /* If the number converted back into a string is not identical
3064 * then it's not possible to encode the string as integer */
3065 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3066
3067 /* Finally check if it fits in our ranges */
3068 if (value >= -(1<<7) && value <= (1<<7)-1) {
3069 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3070 enc[1] = value&0xFF;
3071 return 2;
3072 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3073 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3074 enc[1] = value&0xFF;
3075 enc[2] = (value>>8)&0xFF;
3076 return 3;
3077 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3078 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3079 enc[1] = value&0xFF;
3080 enc[2] = (value>>8)&0xFF;
3081 enc[3] = (value>>16)&0xFF;
3082 enc[4] = (value>>24)&0xFF;
3083 return 5;
3084 } else {
3085 return 0;
3086 }
3087 }
3088
3089 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3090 size_t comprlen, outlen;
3091 unsigned char byte;
3092 void *out;
3093
3094 /* We require at least four bytes compression for this to be worth it */
3095 if (len <= 4) return 0;
3096 outlen = len-4;
3097 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3098 comprlen = lzf_compress(s, len, out, outlen);
3099 if (comprlen == 0) {
3100 zfree(out);
3101 return 0;
3102 }
3103 /* Data compressed! Let's save it on disk */
3104 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3105 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3106 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3107 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3108 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3109 zfree(out);
3110 return comprlen;
3111
3112 writeerr:
3113 zfree(out);
3114 return -1;
3115 }
3116
3117 /* Save a string objet as [len][data] on disk. If the object is a string
3118 * representation of an integer value we try to safe it in a special form */
3119 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3120 int enclen;
3121
3122 /* Try integer encoding */
3123 if (len <= 11) {
3124 unsigned char buf[5];
3125 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3126 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3127 return 0;
3128 }
3129 }
3130
3131 /* Try LZF compression - under 20 bytes it's unable to compress even
3132 * aaaaaaaaaaaaaaaaaa so skip it */
3133 if (server.rdbcompression && len > 20) {
3134 int retval;
3135
3136 retval = rdbSaveLzfStringObject(fp,s,len);
3137 if (retval == -1) return -1;
3138 if (retval > 0) return 0;
3139 /* retval == 0 means data can't be compressed, save the old way */
3140 }
3141
3142 /* Store verbatim */
3143 if (rdbSaveLen(fp,len) == -1) return -1;
3144 if (len && fwrite(s,len,1,fp) == 0) return -1;
3145 return 0;
3146 }
3147
3148 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3149 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3150 int retval;
3151
3152 /* Avoid incr/decr ref count business when possible.
3153 * This plays well with copy-on-write given that we are probably
3154 * in a child process (BGSAVE). Also this makes sure key objects
3155 * of swapped objects are not incRefCount-ed (an assert does not allow
3156 * this in order to avoid bugs) */
3157 if (obj->encoding != REDIS_ENCODING_RAW) {
3158 obj = getDecodedObject(obj);
3159 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3160 decrRefCount(obj);
3161 } else {
3162 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3163 }
3164 return retval;
3165 }
3166
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. Three values of the prefix are special and are not
 * followed by any string:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)buf+1;

        snprintf(text,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(text);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3193
3194 /* Save a Redis object. */
3195 static int rdbSaveObject(FILE *fp, robj *o) {
3196 if (o->type == REDIS_STRING) {
3197 /* Save a string value */
3198 if (rdbSaveStringObject(fp,o) == -1) return -1;
3199 } else if (o->type == REDIS_LIST) {
3200 /* Save a list value */
3201 list *list = o->ptr;
3202 listIter li;
3203 listNode *ln;
3204
3205 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3206 listRewind(list,&li);
3207 while((ln = listNext(&li))) {
3208 robj *eleobj = listNodeValue(ln);
3209
3210 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3211 }
3212 } else if (o->type == REDIS_SET) {
3213 /* Save a set value */
3214 dict *set = o->ptr;
3215 dictIterator *di = dictGetIterator(set);
3216 dictEntry *de;
3217
3218 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3219 while((de = dictNext(di)) != NULL) {
3220 robj *eleobj = dictGetEntryKey(de);
3221
3222 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3223 }
3224 dictReleaseIterator(di);
3225 } else if (o->type == REDIS_ZSET) {
3226 /* Save a set value */
3227 zset *zs = o->ptr;
3228 dictIterator *di = dictGetIterator(zs->dict);
3229 dictEntry *de;
3230
3231 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3232 while((de = dictNext(di)) != NULL) {
3233 robj *eleobj = dictGetEntryKey(de);
3234 double *score = dictGetEntryVal(de);
3235
3236 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3237 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3238 }
3239 dictReleaseIterator(di);
3240 } else if (o->type == REDIS_HASH) {
3241 /* Save a hash value */
3242 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3243 unsigned char *p = zipmapRewind(o->ptr);
3244 unsigned int count = zipmapLen(o->ptr);
3245 unsigned char *key, *val;
3246 unsigned int klen, vlen;
3247
3248 if (rdbSaveLen(fp,count) == -1) return -1;
3249 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3250 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3251 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3252 }
3253 } else {
3254 dictIterator *di = dictGetIterator(o->ptr);
3255 dictEntry *de;
3256
3257 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3258 while((de = dictNext(di)) != NULL) {
3259 robj *key = dictGetEntryKey(de);
3260 robj *val = dictGetEntryVal(de);
3261
3262 if (rdbSaveStringObject(fp,key) == -1) return -1;
3263 if (rdbSaveStringObject(fp,val) == -1) return -1;
3264 }
3265 dictReleaseIterator(di);
3266 }
3267 } else {
3268 redisAssert(0);
3269 }
3270 return 0;
3271 }
3272
3273 /* Return the length the object will have on disk if saved with
3274 * the rdbSaveObject() function. Currently we use a trick to get
3275 * this length with very little changes to the code. In the future
3276 * we could switch to a faster solution. */
3277 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3278 if (fp == NULL) fp = server.devnull;
3279 rewind(fp);
3280 assert(rdbSaveObject(fp,o) != 1);
3281 return ftello(fp);
3282 }
3283
3284 /* Return the number of pages required to save this object in the swap file */
3285 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3286 off_t bytes = rdbSavedObjectLen(o,fp);
3287
3288 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3289 }
3290
3291 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3292 static int rdbSave(char *filename) {
3293 dictIterator *di = NULL;
3294 dictEntry *de;
3295 FILE *fp;
3296 char tmpfile[256];
3297 int j;
3298 time_t now = time(NULL);
3299
3300 /* Wait for I/O therads to terminate, just in case this is a
3301 * foreground-saving, to avoid seeking the swap file descriptor at the
3302 * same time. */
3303 if (server.vm_enabled)
3304 waitEmptyIOJobsQueue();
3305
3306 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3307 fp = fopen(tmpfile,"w");
3308 if (!fp) {
3309 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3310 return REDIS_ERR;
3311 }
3312 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3313 for (j = 0; j < server.dbnum; j++) {
3314 redisDb *db = server.db+j;
3315 dict *d = db->dict;
3316 if (dictSize(d) == 0) continue;
3317 di = dictGetIterator(d);
3318 if (!di) {
3319 fclose(fp);
3320 return REDIS_ERR;
3321 }
3322
3323 /* Write the SELECT DB opcode */
3324 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3325 if (rdbSaveLen(fp,j) == -1) goto werr;
3326
3327 /* Iterate this DB writing every entry */
3328 while((de = dictNext(di)) != NULL) {
3329 robj *key = dictGetEntryKey(de);
3330 robj *o = dictGetEntryVal(de);
3331 time_t expiretime = getExpire(db,key);
3332
3333 /* Save the expire time */
3334 if (expiretime != -1) {
3335 /* If this key is already expired skip it */
3336 if (expiretime < now) continue;
3337 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3338 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3339 }
3340 /* Save the key and associated value. This requires special
3341 * handling if the value is swapped out. */
3342 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3343 key->storage == REDIS_VM_SWAPPING) {
3344 /* Save type, key, value */
3345 if (rdbSaveType(fp,o->type) == -1) goto werr;
3346 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3347 if (rdbSaveObject(fp,o) == -1) goto werr;
3348 } else {
3349 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3350 robj *po;
3351 /* Get a preview of the object in memory */
3352 po = vmPreviewObject(key);
3353 /* Save type, key, value */
3354 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3355 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3356 if (rdbSaveObject(fp,po) == -1) goto werr;
3357 /* Remove the loaded object from memory */
3358 decrRefCount(po);
3359 }
3360 }
3361 dictReleaseIterator(di);
3362 }
3363 /* EOF opcode */
3364 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3365
3366 /* Make sure data will not remain on the OS's output buffers */
3367 fflush(fp);
3368 fsync(fileno(fp));
3369 fclose(fp);
3370
3371 /* Use RENAME to make sure the DB file is changed atomically only
3372 * if the generate DB file is ok. */
3373 if (rename(tmpfile,filename) == -1) {
3374 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3375 unlink(tmpfile);
3376 return REDIS_ERR;
3377 }
3378 redisLog(REDIS_NOTICE,"DB saved on disk");
3379 server.dirty = 0;
3380 server.lastsave = time(NULL);
3381 return REDIS_OK;
3382
3383 werr:
3384 fclose(fp);
3385 unlink(tmpfile);
3386 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3387 if (di) dictReleaseIterator(di);
3388 return REDIS_ERR;
3389 }
3390
3391 static int rdbSaveBackground(char *filename) {
3392 pid_t childpid;
3393
3394 if (server.bgsavechildpid != -1) return REDIS_ERR;
3395 if (server.vm_enabled) waitEmptyIOJobsQueue();
3396 if ((childpid = fork()) == 0) {
3397 /* Child */
3398 if (server.vm_enabled) vmReopenSwapFile();
3399 close(server.fd);
3400 if (rdbSave(filename) == REDIS_OK) {
3401 _exit(0);
3402 } else {
3403 _exit(1);
3404 }
3405 } else {
3406 /* Parent */
3407 if (childpid == -1) {
3408 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3409 strerror(errno));
3410 return REDIS_ERR;
3411 }
3412 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3413 server.bgsavechildpid = childpid;
3414 return REDIS_OK;
3415 }
3416 return REDIS_OK; /* unreached */
3417 }
3418
/* Remove the temporary dump file ("temp-<pid>.rdb") that the saving
 * process with the given pid writes. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3425
/* Read a single type/opcode byte from 'fp'.
 * Returns the byte (0-255), or -1 if it could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 1) return -1;
    return byte;
}
3431
/* Read a time value saved by rdbSaveTime(): a 32 bit signed integer in
 * the byte order of the host. Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 1) return -1;
    return (time_t) stamp;
}
3437
3438 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3439 * of this file for a description of how this are stored on disk.
3440 *
3441 * isencoded is set to 1 if the readed length is not actually a length but
3442 * an "encoding type", check the above comments for more info */
3443 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3444 unsigned char buf[2];
3445 uint32_t len;
3446 int type;
3447
3448 if (isencoded) *isencoded = 0;
3449 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3450 type = (buf[0]&0xC0)>>6;
3451 if (type == REDIS_RDB_6BITLEN) {
3452 /* Read a 6 bit len */
3453 return buf[0]&0x3F;
3454 } else if (type == REDIS_RDB_ENCVAL) {
3455 /* Read a 6 bit len encoding type */
3456 if (isencoded) *isencoded = 1;
3457 return buf[0]&0x3F;
3458 } else if (type == REDIS_RDB_14BITLEN) {
3459 /* Read a 14 bit len */
3460 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3461 return ((buf[0]&0x3F)<<8)|buf[1];
3462 } else {
3463 /* Read a 32 bit len */
3464 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3465 return ntohl(len);
3466 }
3467 }
3468
3469 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3470 unsigned char enc[4];
3471 long long val;
3472
3473 if (enctype == REDIS_RDB_ENC_INT8) {
3474 if (fread(enc,1,1,fp) == 0) return NULL;
3475 val = (signed char)enc[0];
3476 } else if (enctype == REDIS_RDB_ENC_INT16) {
3477 uint16_t v;
3478 if (fread(enc,2,1,fp) == 0) return NULL;
3479 v = enc[0]|(enc[1]<<8);
3480 val = (int16_t)v;
3481 } else if (enctype == REDIS_RDB_ENC_INT32) {
3482 uint32_t v;
3483 if (fread(enc,4,1,fp) == 0) return NULL;
3484 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3485 val = (int32_t)v;
3486 } else {
3487 val = 0; /* anti-warning */
3488 redisAssert(0);
3489 }
3490 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3491 }
3492
3493 static robj *rdbLoadLzfStringObject(FILE*fp) {
3494 unsigned int len, clen;
3495 unsigned char *c = NULL;
3496 sds val = NULL;
3497
3498 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3499 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3500 if ((c = zmalloc(clen)) == NULL) goto err;
3501 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3502 if (fread(c,clen,1,fp) == 0) goto err;
3503 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3504 zfree(c);
3505 return createObject(REDIS_STRING,val);
3506 err:
3507 zfree(c);
3508 sdsfree(val);
3509 return NULL;
3510 }
3511
3512 static robj *rdbLoadStringObject(FILE*fp) {
3513 int isencoded;
3514 uint32_t len;
3515 sds val;
3516
3517 len = rdbLoadLen(fp,&isencoded);
3518 if (isencoded) {
3519 switch(len) {
3520 case REDIS_RDB_ENC_INT8:
3521 case REDIS_RDB_ENC_INT16:
3522 case REDIS_RDB_ENC_INT32:
3523 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3524 case REDIS_RDB_ENC_LZF:
3525 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3526 default:
3527 redisAssert(0);
3528 }
3529 }
3530
3531 if (len == REDIS_RDB_LENERR) return NULL;
3532 val = sdsnewlen(NULL,len);
3533 if (len && fread(val,len,1,fp) == 0) {
3534 sdsfree(val);
3535 return NULL;
3536 }
3537 return tryObjectSharing(createObject(REDIS_STRING,val));
3538 }
3539
3540 /* For information about double serialization check rdbSaveDoubleValue() */
3541 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3542 char buf[128];
3543 unsigned char len;
3544
3545 if (fread(&len,1,1,fp) == 0) return -1;
3546 switch(len) {
3547 case 255: *val = R_NegInf; return 0;
3548 case 254: *val = R_PosInf; return 0;
3549 case 253: *val = R_Nan; return 0;
3550 default:
3551 if (fread(buf,len,1,fp) == 0) return -1;
3552 buf[len] = '\0';
3553 sscanf(buf, "%lg", val);
3554 return 0;
3555 }
3556 }
3557
3558 /* Load a Redis object of the specified type from the specified file.
3559 * On success a newly allocated object is returned, otherwise NULL. */
3560 static robj *rdbLoadObject(int type, FILE *fp) {
3561 robj *o;
3562
3563 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3564 if (type == REDIS_STRING) {
3565 /* Read string value */
3566 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3567 tryObjectEncoding(o);
3568 } else if (type == REDIS_LIST || type == REDIS_SET) {
3569 /* Read list/set value */
3570 uint32_t listlen;
3571
3572 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3573 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3574 /* It's faster to expand the dict to the right size asap in order
3575 * to avoid rehashing */
3576 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3577 dictExpand(o->ptr,listlen);
3578 /* Load every single element of the list/set */
3579 while(listlen--) {
3580 robj *ele;
3581
3582 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3583 tryObjectEncoding(ele);
3584 if (type == REDIS_LIST) {
3585 listAddNodeTail((list*)o->ptr,ele);
3586 } else {
3587 dictAdd((dict*)o->ptr,ele,NULL);
3588 }
3589 }
3590 } else if (type == REDIS_ZSET) {
3591 /* Read list/set value */
3592 size_t zsetlen;
3593 zset *zs;
3594
3595 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3596 o = createZsetObject();
3597 zs = o->ptr;
3598 /* Load every single element of the list/set */
3599 while(zsetlen--) {
3600 robj *ele;
3601 double *score = zmalloc(sizeof(double));
3602
3603 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3604 tryObjectEncoding(ele);
3605 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3606 dictAdd(zs->dict,ele,score);
3607 zslInsert(zs->zsl,*score,ele);
3608 incrRefCount(ele); /* added to skiplist */
3609 }
3610 } else if (type == REDIS_HASH) {
3611 size_t hashlen;
3612
3613 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3614 o = createHashObject();
3615 /* Too many entries? Use an hash table. */
3616 if (hashlen > server.hash_max_zipmap_entries)
3617 convertToRealHash(o);
3618 /* Load every key/value, then set it into the zipmap or hash
3619 * table, as needed. */
3620 while(hashlen--) {
3621 robj *key, *val;
3622
3623 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3624 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3625 /* If we are using a zipmap and there are too big values
3626 * the object is converted to real hash table encoding. */
3627 if (o->encoding != REDIS_ENCODING_HT &&
3628 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3629 sdslen(val->ptr) > server.hash_max_zipmap_value))
3630 {
3631 convertToRealHash(o);
3632 }
3633
3634 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3635 unsigned char *zm = o->ptr;
3636
3637 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3638 val->ptr,sdslen(val->ptr),NULL);
3639 o->ptr = zm;
3640 decrRefCount(key);
3641 decrRefCount(val);
3642 } else {
3643 tryObjectEncoding(key);
3644 tryObjectEncoding(val);
3645 dictAdd((dict*)o->ptr,key,val);
3646 }
3647 }
3648 } else {
3649 redisAssert(0);
3650 }
3651 return o;
3652 }
3653
3654 static int rdbLoad(char *filename) {
3655 FILE *fp;
3656 robj *keyobj = NULL;
3657 uint32_t dbid;
3658 int type, retval, rdbver;
3659 dict *d = server.db[0].dict;
3660 redisDb *db = server.db+0;
3661 char buf[1024];
3662 time_t expiretime = -1, now = time(NULL);
3663 long long loadedkeys = 0;
3664
3665 fp = fopen(filename,"r");
3666 if (!fp) return REDIS_ERR;
3667 if (fread(buf,9,1,fp) == 0) goto eoferr;
3668 buf[9] = '\0';
3669 if (memcmp(buf,"REDIS",5) != 0) {
3670 fclose(fp);
3671 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3672 return REDIS_ERR;
3673 }
3674 rdbver = atoi(buf+5);
3675 if (rdbver != 1) {
3676 fclose(fp);
3677 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3678 return REDIS_ERR;
3679 }
3680 while(1) {
3681 robj *o;
3682
3683 /* Read type. */
3684 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3685 if (type == REDIS_EXPIRETIME) {
3686 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3687 /* We read the time so we need to read the object type again */
3688 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3689 }
3690 if (type == REDIS_EOF) break;
3691 /* Handle SELECT DB opcode as a special case */
3692 if (type == REDIS_SELECTDB) {
3693 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3694 goto eoferr;
3695 if (dbid >= (unsigned)server.dbnum) {
3696 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3697 exit(1);
3698 }
3699 db = server.db+dbid;
3700 d = db->dict;
3701 continue;
3702 }
3703 /* Read key */
3704 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3705 /* Read value */
3706 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3707 /* Add the new object in the hash table */
3708 retval = dictAdd(d,keyobj,o);
3709 if (retval == DICT_ERR) {
3710 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3711 exit(1);
3712 }
3713 /* Set the expire time if needed */
3714 if (expiretime != -1) {
3715 setExpire(db,keyobj,expiretime);
3716 /* Delete this key if already expired */
3717 if (expiretime < now) deleteKey(db,keyobj);
3718 expiretime = -1;
3719 }
3720 keyobj = o = NULL;
3721 /* Handle swapping while loading big datasets when VM is on */
3722 loadedkeys++;
3723 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3724 while (zmalloc_used_memory() > server.vm_max_memory) {
3725 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3726 }
3727 }
3728 }
3729 fclose(fp);
3730 return REDIS_OK;
3731
3732 eoferr: /* unexpected end of file is handled here with a fatal exit */
3733 if (keyobj) decrRefCount(keyobj);
3734 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3735 exit(1);
3736 return REDIS_ERR; /* Just to avoid warning */
3737 }
3738
3739 /*================================== Commands =============================== */
3740
3741 static void authCommand(redisClient *c) {
3742 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3743 c->authenticated = 1;
3744 addReply(c,shared.ok);
3745 } else {
3746 c->authenticated = 0;
3747 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3748 }
3749 }
3750
3751 static void pingCommand(redisClient *c) {
3752 addReply(c,shared.pong);
3753 }
3754
3755 static void echoCommand(redisClient *c) {
3756 addReplyBulk(c,c->argv[1]);
3757 }
3758
3759 /*=================================== Strings =============================== */
3760
3761 static void setGenericCommand(redisClient *c, int nx) {
3762 int retval;
3763
3764 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3765 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3766 if (retval == DICT_ERR) {
3767 if (!nx) {
3768 /* If the key is about a swapped value, we want a new key object
3769 * to overwrite the old. So we delete the old key in the database.
3770 * This will also make sure that swap pages about the old object
3771 * will be marked as free. */
3772 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3773 incrRefCount(c->argv[1]);
3774 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3775 incrRefCount(c->argv[2]);
3776 } else {
3777 addReply(c,shared.czero);
3778 return;
3779 }
3780 } else {
3781 incrRefCount(c->argv[1]);
3782 incrRefCount(c->argv[2]);
3783 }
3784 server.dirty++;
3785 removeExpire(c->db,c->argv[1]);
3786 addReply(c, nx ? shared.cone : shared.ok);
3787 }
3788
3789 static void setCommand(redisClient *c) {
3790 setGenericCommand(c,0);
3791 }
3792
3793 static void setnxCommand(redisClient *c) {
3794 setGenericCommand(c,1);
3795 }
3796
3797 static int getGenericCommand(redisClient *c) {
3798 robj *o;
3799
3800 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3801 return REDIS_OK;
3802
3803 if (o->type != REDIS_STRING) {
3804 addReply(c,shared.wrongtypeerr);
3805 return REDIS_ERR;
3806 } else {
3807 addReplyBulk(c,o);
3808 return REDIS_OK;
3809 }
3810 }
3811
3812 static void getCommand(redisClient *c) {
3813 getGenericCommand(c);
3814 }
3815
3816 static void getsetCommand(redisClient *c) {
3817 if (getGenericCommand(c) == REDIS_ERR) return;
3818 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3819 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3820 } else {
3821 incrRefCount(c->argv[1]);
3822 }
3823 incrRefCount(c->argv[2]);
3824 server.dirty++;
3825 removeExpire(c->db,c->argv[1]);
3826 }
3827
3828 static void mgetCommand(redisClient *c) {
3829 int j;
3830
3831 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3832 for (j = 1; j < c->argc; j++) {
3833 robj *o = lookupKeyRead(c->db,c->argv[j]);
3834 if (o == NULL) {
3835 addReply(c,shared.nullbulk);
3836 } else {
3837 if (o->type != REDIS_STRING) {
3838 addReply(c,shared.nullbulk);
3839 } else {
3840 addReplyBulk(c,o);
3841 }
3842 }
3843 }
3844 }
3845
3846 static void msetGenericCommand(redisClient *c, int nx) {
3847 int j, busykeys = 0;
3848
3849 if ((c->argc % 2) == 0) {
3850 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3851 return;
3852 }
3853 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3854 * set nothing at all if at least one already key exists. */
3855 if (nx) {
3856 for (j = 1; j < c->argc; j += 2) {
3857 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3858 busykeys++;
3859 }
3860 }
3861 }
3862 if (busykeys) {
3863 addReply(c, shared.czero);
3864 return;
3865 }
3866
3867 for (j = 1; j < c->argc; j += 2) {
3868 int retval;
3869
3870 tryObjectEncoding(c->argv[j+1]);
3871 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3872 if (retval == DICT_ERR) {
3873 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3874 incrRefCount(c->argv[j+1]);
3875 } else {
3876 incrRefCount(c->argv[j]);
3877 incrRefCount(c->argv[j+1]);
3878 }
3879 removeExpire(c->db,c->argv[j]);
3880 }
3881 server.dirty += (c->argc-1)/2;
3882 addReply(c, nx ? shared.cone : shared.ok);
3883 }
3884
3885 static void msetCommand(redisClient *c) {
3886 msetGenericCommand(c,0);
3887 }
3888
3889 static void msetnxCommand(redisClient *c) {
3890 msetGenericCommand(c,1);
3891 }
3892
3893 static void incrDecrCommand(redisClient *c, long long incr) {
3894 long long value;
3895 int retval;
3896 robj *o;
3897
3898 o = lookupKeyWrite(c->db,c->argv[1]);
3899 if (o == NULL) {
3900 value = 0;
3901 } else {
3902 if (o->type != REDIS_STRING) {
3903 value = 0;
3904 } else {
3905 char *eptr;
3906
3907 if (o->encoding == REDIS_ENCODING_RAW)
3908 value = strtoll(o->ptr, &eptr, 10);
3909 else if (o->encoding == REDIS_ENCODING_INT)
3910 value = (long)o->ptr;
3911 else
3912 redisAssert(1 != 1);
3913 }
3914 }
3915
3916 value += incr;
3917 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3918 tryObjectEncoding(o);
3919 retval = dictAdd(c->db->dict,c->argv[1],o);
3920 if (retval == DICT_ERR) {
3921 dictReplace(c->db->dict,c->argv[1],o);
3922 removeExpire(c->db,c->argv[1]);
3923 } else {
3924 incrRefCount(c->argv[1]);
3925 }
3926 server.dirty++;
3927 addReply(c,shared.colon);
3928 addReply(c,o);
3929 addReply(c,shared.crlf);
3930 }
3931
3932 static void incrCommand(redisClient *c) {
3933 incrDecrCommand(c,1);
3934 }
3935
3936 static void decrCommand(redisClient *c) {
3937 incrDecrCommand(c,-1);
3938 }
3939
3940 static void incrbyCommand(redisClient *c) {
3941 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3942 incrDecrCommand(c,incr);
3943 }
3944
3945 static void decrbyCommand(redisClient *c) {
3946 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3947 incrDecrCommand(c,-incr);
3948 }
3949
3950 static void appendCommand(redisClient *c) {
3951 int retval;
3952 size_t totlen;
3953 robj *o;
3954
3955 o = lookupKeyWrite(c->db,c->argv[1]);
3956 if (o == NULL) {
3957 /* Create the key */
3958 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3959 incrRefCount(c->argv[1]);
3960 incrRefCount(c->argv[2]);
3961 totlen = stringObjectLen(c->argv[2]);
3962 } else {
3963 dictEntry *de;
3964
3965 de = dictFind(c->db->dict,c->argv[1]);
3966 assert(de != NULL);
3967
3968 o = dictGetEntryVal(de);
3969 if (o->type != REDIS_STRING) {
3970 addReply(c,shared.wrongtypeerr);
3971 return;
3972 }
3973 /* If the object is specially encoded or shared we have to make
3974 * a copy */
3975 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3976 robj *decoded = getDecodedObject(o);
3977
3978 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3979 decrRefCount(decoded);
3980 dictReplace(c->db->dict,c->argv[1],o);
3981 }
3982 /* APPEND! */
3983 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3984 o->ptr = sdscatlen(o->ptr,
3985 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3986 } else {
3987 o->ptr = sdscatprintf(o->ptr, "%ld",
3988 (unsigned long) c->argv[2]->ptr);
3989 }
3990 totlen = sdslen(o->ptr);
3991 }
3992 server.dirty++;
3993 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3994 }
3995
3996 static void substrCommand(redisClient *c) {
3997 robj *o;
3998 long start = atoi(c->argv[2]->ptr);
3999 long end = atoi(c->argv[3]->ptr);
4000 size_t rangelen, strlen;
4001 sds range;
4002
4003 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4004 checkType(c,o,REDIS_STRING)) return;
4005
4006 o = getDecodedObject(o);
4007 strlen = sdslen(o->ptr);
4008
4009 /* convert negative indexes */
4010 if (start < 0) start = strlen+start;
4011 if (end < 0) end = strlen+end;
4012 if (start < 0) start = 0;
4013 if (end < 0) end = 0;
4014
4015 /* indexes sanity checks */
4016 if (start > end || (size_t)start >= strlen) {
4017 /* Out of range start or start > end result in null reply */
4018 addReply(c,shared.nullbulk);
4019 decrRefCount(o);
4020 return;
4021 }
4022 if ((size_t)end >= strlen) end = strlen-1;
4023 rangelen = (end-start)+1;
4024
4025 /* Return the result */
4026 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4027 range = sdsnewlen((char*)o->ptr+start,rangelen);
4028 addReplySds(c,range);
4029 addReply(c,shared.crlf);
4030 decrRefCount(o);
4031 }
4032
4033 /* ========================= Type agnostic commands ========================= */
4034
4035 static void delCommand(redisClient *c) {
4036 int deleted = 0, j;
4037
4038 for (j = 1; j < c->argc; j++) {
4039 if (deleteKey(c->db,c->argv[j])) {
4040 server.dirty++;
4041 deleted++;
4042 }
4043 }
4044 addReplyLong(c,deleted);
4045 }
4046
4047 static void existsCommand(redisClient *c) {
4048 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4049 }
4050
4051 static void selectCommand(redisClient *c) {
4052 int id = atoi(c->argv[1]->ptr);
4053
4054 if (selectDb(c,id) == REDIS_ERR) {
4055 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4056 } else {
4057 addReply(c,shared.ok);
4058 }
4059 }
4060
4061 static void randomkeyCommand(redisClient *c) {
4062 dictEntry *de;
4063
4064 while(1) {
4065 de = dictGetRandomKey(c->db->dict);
4066 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4067 }
4068 if (de == NULL) {
4069 addReply(c,shared.plus);
4070 addReply(c,shared.crlf);
4071 } else {
4072 addReply(c,shared.plus);
4073 addReply(c,dictGetEntryKey(de));
4074 addReply(c,shared.crlf);
4075 }
4076 }
4077
4078 static void keysCommand(redisClient *c) {
4079 dictIterator *di;
4080 dictEntry *de;
4081 sds pattern = c->argv[1]->ptr;
4082 int plen = sdslen(pattern);
4083 unsigned long numkeys = 0;
4084 robj *lenobj = createObject(REDIS_STRING,NULL);
4085
4086 di = dictGetIterator(c->db->dict);
4087 addReply(c,lenobj);
4088 decrRefCount(lenobj);
4089 while((de = dictNext(di)) != NULL) {
4090 robj *keyobj = dictGetEntryKey(de);
4091
4092 sds key = keyobj->ptr;
4093 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4094 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4095 if (expireIfNeeded(c->db,keyobj) == 0) {
4096 addReplyBulk(c,keyobj);
4097 numkeys++;
4098 }
4099 }
4100 }
4101 dictReleaseIterator(di);
4102 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4103 }
4104
4105 static void dbsizeCommand(redisClient *c) {
4106 addReplySds(c,
4107 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4108 }
4109
4110 static void lastsaveCommand(redisClient *c) {
4111 addReplySds(c,
4112 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4113 }
4114
4115 static void typeCommand(redisClient *c) {
4116 robj *o;
4117 char *type;
4118
4119 o = lookupKeyRead(c->db,c->argv[1]);
4120 if (o == NULL) {
4121 type = "+none";
4122 } else {
4123 switch(o->type) {
4124 case REDIS_STRING: type = "+string"; break;
4125 case REDIS_LIST: type = "+list"; break;
4126 case REDIS_SET: type = "+set"; break;
4127 case REDIS_ZSET: type = "+zset"; break;
4128 case REDIS_HASH: type = "+hash"; break;
4129 default: type = "+unknown"; break;
4130 }
4131 }
4132 addReplySds(c,sdsnew(type));
4133 addReply(c,shared.crlf);
4134 }
4135
4136 static void saveCommand(redisClient *c) {
4137 if (server.bgsavechildpid != -1) {
4138 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4139 return;
4140 }
4141 if (rdbSave(server.dbfilename) == REDIS_OK) {
4142 addReply(c,shared.ok);
4143 } else {
4144 addReply(c,shared.err);
4145 }
4146 }
4147
4148 static void bgsaveCommand(redisClient *c) {
4149 if (server.bgsavechildpid != -1) {
4150 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4151 return;
4152 }
4153 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4154 char *status = "+Background saving started\r\n";
4155 addReplySds(c,sdsnew(status));
4156 } else {
4157 addReply(c,shared.err);
4158 }
4159 }
4160
4161 static void shutdownCommand(redisClient *c) {
4162 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4163 /* Kill the saving child if there is a background saving in progress.
4164 We want to avoid race conditions, for instance our saving child may
4165 overwrite the synchronous saving did by SHUTDOWN. */
4166 if (server.bgsavechildpid != -1) {
4167 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4168 kill(server.bgsavechildpid,SIGKILL);
4169 rdbRemoveTempFile(server.bgsavechildpid);
4170 }
4171 if (server.appendonly) {
4172 /* Append only file: fsync() the AOF and exit */
4173 fsync(server.appendfd);
4174 if (server.vm_enabled) unlink(server.vm_swap_file);
4175 exit(0);
4176 } else {
4177 /* Snapshotting. Perform a SYNC SAVE and exit */
4178 if (rdbSave(server.dbfilename) == REDIS_OK) {
4179 if (server.daemonize)
4180 unlink(server.pidfile);
4181 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4182 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4183 if (server.vm_enabled) unlink(server.vm_swap_file);
4184 exit(0);
4185 } else {
4186 /* Ooops.. error saving! The best we can do is to continue
4187 * operating. Note that if there was a background saving process,
4188 * in the next cron() Redis will be notified that the background
4189 * saving aborted, handling special stuff like slaves pending for
4190 * synchronization... */
4191 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4192 addReplySds(c,
4193 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4194 }
4195 }
4196 }
4197
4198 static void renameGenericCommand(redisClient *c, int nx) {
4199 robj *o;
4200
4201 /* To use the same key as src and dst is probably an error */
4202 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4203 addReply(c,shared.sameobjecterr);
4204 return;
4205 }
4206
4207 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4208 return;
4209
4210 incrRefCount(o);
4211 deleteIfVolatile(c->db,c->argv[2]);
4212 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4213 if (nx) {
4214 decrRefCount(o);
4215 addReply(c,shared.czero);
4216 return;
4217 }
4218 dictReplace(c->db->dict,c->argv[2],o);
4219 } else {
4220 incrRefCount(c->argv[2]);
4221 }
4222 deleteKey(c->db,c->argv[1]);
4223 server.dirty++;
4224 addReply(c,nx ? shared.cone : shared.ok);
4225 }
4226
4227 static void renameCommand(redisClient *c) {
4228 renameGenericCommand(c,0);
4229 }
4230
4231 static void renamenxCommand(redisClient *c) {
4232 renameGenericCommand(c,1);
4233 }
4234
4235 static void moveCommand(redisClient *c) {
4236 robj *o;
4237 redisDb *src, *dst;
4238 int srcid;
4239
4240 /* Obtain source and target DB pointers */
4241 src = c->db;
4242 srcid = c->db->id;
4243 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4244 addReply(c,shared.outofrangeerr);
4245 return;
4246 }
4247 dst = c->db;
4248 selectDb(c,srcid); /* Back to the source DB */
4249
4250 /* If the user is moving using as target the same
4251 * DB as the source DB it is probably an error. */
4252 if (src == dst) {
4253 addReply(c,shared.sameobjecterr);
4254 return;
4255 }
4256
4257 /* Check if the element exists and get a reference */
4258 o = lookupKeyWrite(c->db,c->argv[1]);
4259 if (!o) {
4260 addReply(c,shared.czero);
4261 return;
4262 }
4263
4264 /* Try to add the element to the target DB */
4265 deleteIfVolatile(dst,c->argv[1]);
4266 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4267 addReply(c,shared.czero);
4268 return;
4269 }
4270 incrRefCount(c->argv[1]);
4271 incrRefCount(o);
4272
4273 /* OK! key moved, free the entry in the source DB */
4274 deleteKey(src,c->argv[1]);
4275 server.dirty++;
4276 addReply(c,shared.cone);
4277 }
4278
4279 /* =================================== Lists ================================ */
4280 static void pushGenericCommand(redisClient *c, int where) {
4281 robj *lobj;
4282 list *list;
4283
4284 lobj = lookupKeyWrite(c->db,c->argv[1]);
4285 if (lobj == NULL) {
4286 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4287 addReply(c,shared.cone);
4288 return;
4289 }
4290 lobj = createListObject();
4291 list = lobj->ptr;
4292 if (where == REDIS_HEAD) {
4293 listAddNodeHead(list,c->argv[2]);
4294 } else {
4295 listAddNodeTail(list,c->argv[2]);
4296 }
4297 dictAdd(c->db->dict,c->argv[1],lobj);
4298 incrRefCount(c->argv[1]);
4299 incrRefCount(c->argv[2]);
4300 } else {
4301 if (lobj->type != REDIS_LIST) {
4302 addReply(c,shared.wrongtypeerr);
4303 return;
4304 }
4305 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4306 addReply(c,shared.cone);
4307 return;
4308 }
4309 list = lobj->ptr;
4310 if (where == REDIS_HEAD) {
4311 listAddNodeHead(list,c->argv[2]);
4312 } else {
4313 listAddNodeTail(list,c->argv[2]);
4314 }
4315 incrRefCount(c->argv[2]);
4316 }
4317 server.dirty++;
4318 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4319 }
4320
4321 static void lpushCommand(redisClient *c) {
4322 pushGenericCommand(c,REDIS_HEAD);
4323 }
4324
4325 static void rpushCommand(redisClient *c) {
4326 pushGenericCommand(c,REDIS_TAIL);
4327 }
4328
4329 static void llenCommand(redisClient *c) {
4330 robj *o;
4331 list *l;
4332
4333 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4334 checkType(c,o,REDIS_LIST)) return;
4335
4336 l = o->ptr;
4337 addReplyUlong(c,listLength(l));
4338 }
4339
4340 static void lindexCommand(redisClient *c) {
4341 robj *o;
4342 int index = atoi(c->argv[2]->ptr);
4343 list *list;
4344 listNode *ln;
4345
4346 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4347 checkType(c,o,REDIS_LIST)) return;
4348 list = o->ptr;
4349
4350 ln = listIndex(list, index);
4351 if (ln == NULL) {
4352 addReply(c,shared.nullbulk);
4353 } else {
4354 robj *ele = listNodeValue(ln);
4355 addReplyBulk(c,ele);
4356 }
4357 }
4358
4359 static void lsetCommand(redisClient *c) {
4360 robj *o;
4361 int index = atoi(c->argv[2]->ptr);
4362 list *list;
4363 listNode *ln;
4364
4365 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4366 checkType(c,o,REDIS_LIST)) return;
4367 list = o->ptr;
4368
4369 ln = listIndex(list, index);
4370 if (ln == NULL) {
4371 addReply(c,shared.outofrangeerr);
4372 } else {
4373 robj *ele = listNodeValue(ln);
4374
4375 decrRefCount(ele);
4376 listNodeValue(ln) = c->argv[3];
4377 incrRefCount(c->argv[3]);
4378 addReply(c,shared.ok);
4379 server.dirty++;
4380 }
4381 }
4382
4383 static void popGenericCommand(redisClient *c, int where) {
4384 robj *o;
4385 list *list;
4386 listNode *ln;
4387
4388 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4389 checkType(c,o,REDIS_LIST)) return;
4390 list = o->ptr;
4391
4392 if (where == REDIS_HEAD)
4393 ln = listFirst(list);
4394 else
4395 ln = listLast(list);
4396
4397 if (ln == NULL) {
4398 addReply(c,shared.nullbulk);
4399 } else {
4400 robj *ele = listNodeValue(ln);
4401 addReplyBulk(c,ele);
4402 listDelNode(list,ln);
4403 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4404 server.dirty++;
4405 }
4406 }
4407
4408 static void lpopCommand(redisClient *c) {
4409 popGenericCommand(c,REDIS_HEAD);
4410 }
4411
4412 static void rpopCommand(redisClient *c) {
4413 popGenericCommand(c,REDIS_TAIL);
4414 }
4415
4416 static void lrangeCommand(redisClient *c) {
4417 robj *o;
4418 int start = atoi(c->argv[2]->ptr);
4419 int end = atoi(c->argv[3]->ptr);
4420 int llen;
4421 int rangelen, j;
4422 list *list;
4423 listNode *ln;
4424 robj *ele;
4425
4426 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4427 checkType(c,o,REDIS_LIST)) return;
4428 list = o->ptr;
4429 llen = listLength(list);
4430
4431 /* convert negative indexes */
4432 if (start < 0) start = llen+start;
4433 if (end < 0) end = llen+end;
4434 if (start < 0) start = 0;
4435 if (end < 0) end = 0;
4436
4437 /* indexes sanity checks */
4438 if (start > end || start >= llen) {
4439 /* Out of range start or start > end result in empty list */
4440 addReply(c,shared.emptymultibulk);
4441 return;
4442 }
4443 if (end >= llen) end = llen-1;
4444 rangelen = (end-start)+1;
4445
4446 /* Return the result in form of a multi-bulk reply */
4447 ln = listIndex(list, start);
4448 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4449 for (j = 0; j < rangelen; j++) {
4450 ele = listNodeValue(ln);
4451 addReplyBulk(c,ele);
4452 ln = ln->next;
4453 }
4454 }
4455
4456 static void ltrimCommand(redisClient *c) {
4457 robj *o;
4458 int start = atoi(c->argv[2]->ptr);
4459 int end = atoi(c->argv[3]->ptr);
4460 int llen;
4461 int j, ltrim, rtrim;
4462 list *list;
4463 listNode *ln;
4464
4465 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4466 checkType(c,o,REDIS_LIST)) return;
4467 list = o->ptr;
4468 llen = listLength(list);
4469
4470 /* convert negative indexes */
4471 if (start < 0) start = llen+start;
4472 if (end < 0) end = llen+end;
4473 if (start < 0) start = 0;
4474 if (end < 0) end = 0;
4475
4476 /* indexes sanity checks */
4477 if (start > end || start >= llen) {
4478 /* Out of range start or start > end result in empty list */
4479 ltrim = llen;
4480 rtrim = 0;
4481 } else {
4482 if (end >= llen) end = llen-1;
4483 ltrim = start;
4484 rtrim = llen-end-1;
4485 }
4486
4487 /* Remove list elements to perform the trim */
4488 for (j = 0; j < ltrim; j++) {
4489 ln = listFirst(list);
4490 listDelNode(list,ln);
4491 }
4492 for (j = 0; j < rtrim; j++) {
4493 ln = listLast(list);
4494 listDelNode(list,ln);
4495 }
4496 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4497 server.dirty++;
4498 addReply(c,shared.ok);
4499 }
4500
4501 static void lremCommand(redisClient *c) {
4502 robj *o;
4503 list *list;
4504 listNode *ln, *next;
4505 int toremove = atoi(c->argv[2]->ptr);
4506 int removed = 0;
4507 int fromtail = 0;
4508
4509 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4510 checkType(c,o,REDIS_LIST)) return;
4511 list = o->ptr;
4512
4513 if (toremove < 0) {
4514 toremove = -toremove;
4515 fromtail = 1;
4516 }
4517 ln = fromtail ? list->tail : list->head;
4518 while (ln) {
4519 robj *ele = listNodeValue(ln);
4520
4521 next = fromtail ? ln->prev : ln->next;
4522 if (compareStringObjects(ele,c->argv[3]) == 0) {
4523 listDelNode(list,ln);
4524 server.dirty++;
4525 removed++;
4526 if (toremove && removed == toremove) break;
4527 }
4528 ln = next;
4529 }
4530 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4531 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4532 }
4533
4534 /* This is the semantic of this command:
4535 * RPOPLPUSH srclist dstlist:
4536 * IF LLEN(srclist) > 0
4537 * element = RPOP srclist
4538 * LPUSH dstlist element
4539 * RETURN element
4540 * ELSE
4541 * RETURN nil
4542 * END
4543 * END
4544 *
4545 * The idea is to be able to get an element from a list in a reliable way
4546 * since the element is not just returned but pushed against another list
4547 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4548 */
4549 static void rpoplpushcommand(redisClient *c) {
4550 robj *sobj;
4551 list *srclist;
4552 listNode *ln;
4553
4554 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4555 checkType(c,sobj,REDIS_LIST)) return;
4556 srclist = sobj->ptr;
4557 ln = listLast(srclist);
4558
4559 if (ln == NULL) {
4560 addReply(c,shared.nullbulk);
4561 } else {
4562 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4563 robj *ele = listNodeValue(ln);
4564 list *dstlist;
4565
4566 if (dobj && dobj->type != REDIS_LIST) {
4567 addReply(c,shared.wrongtypeerr);
4568 return;
4569 }
4570
4571 /* Add the element to the target list (unless it's directly
4572 * passed to some BLPOP-ing client */
4573 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4574 if (dobj == NULL) {
4575 /* Create the list if the key does not exist */
4576 dobj = createListObject();
4577 dictAdd(c->db->dict,c->argv[2],dobj);
4578 incrRefCount(c->argv[2]);
4579 }
4580 dstlist = dobj->ptr;
4581 listAddNodeHead(dstlist,ele);
4582 incrRefCount(ele);
4583 }
4584
4585 /* Send the element to the client as reply as well */
4586 addReplyBulk(c,ele);
4587
4588 /* Finally remove the element from the source list */
4589 listDelNode(srclist,ln);
4590 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4591 server.dirty++;
4592 }
4593 }
4594
4595 /* ==================================== Sets ================================ */
4596
4597 static void saddCommand(redisClient *c) {
4598 robj *set;
4599
4600 set = lookupKeyWrite(c->db,c->argv[1]);
4601 if (set == NULL) {
4602 set = createSetObject();
4603 dictAdd(c->db->dict,c->argv[1],set);
4604 incrRefCount(c->argv[1]);
4605 } else {
4606 if (set->type != REDIS_SET) {
4607 addReply(c,shared.wrongtypeerr);
4608 return;
4609 }
4610 }
4611 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4612 incrRefCount(c->argv[2]);
4613 server.dirty++;
4614 addReply(c,shared.cone);
4615 } else {
4616 addReply(c,shared.czero);
4617 }
4618 }
4619
4620 static void sremCommand(redisClient *c) {
4621 robj *set;
4622
4623 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4624 checkType(c,set,REDIS_SET)) return;
4625
4626 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4627 server.dirty++;
4628 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4629 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4630 addReply(c,shared.cone);
4631 } else {
4632 addReply(c,shared.czero);
4633 }
4634 }
4635
4636 static void smoveCommand(redisClient *c) {
4637 robj *srcset, *dstset;
4638
4639 srcset = lookupKeyWrite(c->db,c->argv[1]);
4640 dstset = lookupKeyWrite(c->db,c->argv[2]);
4641
4642 /* If the source key does not exist return 0, if it's of the wrong type
4643 * raise an error */
4644 if (srcset == NULL || srcset->type != REDIS_SET) {
4645 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4646 return;
4647 }
4648 /* Error if the destination key is not a set as well */
4649 if (dstset && dstset->type != REDIS_SET) {
4650 addReply(c,shared.wrongtypeerr);
4651 return;
4652 }
4653 /* Remove the element from the source set */
4654 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4655 /* Key not found in the src set! return zero */
4656 addReply(c,shared.czero);
4657 return;
4658 }
4659 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4660 deleteKey(c->db,c->argv[1]);
4661 server.dirty++;
4662 /* Add the element to the destination set */
4663 if (!dstset) {
4664 dstset = createSetObject();
4665 dictAdd(c->db->dict,c->argv[2],dstset);
4666 incrRefCount(c->argv[2]);
4667 }
4668 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4669 incrRefCount(c->argv[3]);
4670 addReply(c,shared.cone);
4671 }
4672
4673 static void sismemberCommand(redisClient *c) {
4674 robj *set;
4675
4676 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4677 checkType(c,set,REDIS_SET)) return;
4678
4679 if (dictFind(set->ptr,c->argv[2]))
4680 addReply(c,shared.cone);
4681 else
4682 addReply(c,shared.czero);
4683 }
4684
4685 static void scardCommand(redisClient *c) {
4686 robj *o;
4687 dict *s;
4688
4689 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4690 checkType(c,o,REDIS_SET)) return;
4691
4692 s = o->ptr;
4693 addReplyUlong(c,dictSize(s));
4694 }
4695
4696 static void spopCommand(redisClient *c) {
4697 robj *set;
4698 dictEntry *de;
4699
4700 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4701 checkType(c,set,REDIS_SET)) return;
4702
4703 de = dictGetRandomKey(set->ptr);
4704 if (de == NULL) {
4705 addReply(c,shared.nullbulk);
4706 } else {
4707 robj *ele = dictGetEntryKey(de);
4708
4709 addReplyBulk(c,ele);
4710 dictDelete(set->ptr,ele);
4711 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4712 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4713 server.dirty++;
4714 }
4715 }
4716
4717 static void srandmemberCommand(redisClient *c) {
4718 robj *set;
4719 dictEntry *de;
4720
4721 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4722 checkType(c,set,REDIS_SET)) return;
4723
4724 de = dictGetRandomKey(set->ptr);
4725 if (de == NULL) {
4726 addReply(c,shared.nullbulk);
4727 } else {
4728 robj *ele = dictGetEntryKey(de);
4729
4730 addReplyBulk(c,ele);
4731 }
4732 }
4733
4734 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4735 dict **d1 = (void*) s1, **d2 = (void*) s2;
4736
4737 return dictSize(*d1)-dictSize(*d2);
4738 }
4739
4740 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4741 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4742 dictIterator *di;
4743 dictEntry *de;
4744 robj *lenobj = NULL, *dstset = NULL;
4745 unsigned long j, cardinality = 0;
4746
4747 for (j = 0; j < setsnum; j++) {
4748 robj *setobj;
4749
4750 setobj = dstkey ?
4751 lookupKeyWrite(c->db,setskeys[j]) :
4752 lookupKeyRead(c->db,setskeys[j]);
4753 if (!setobj) {
4754 zfree(dv);
4755 if (dstkey) {
4756 if (deleteKey(c->db,dstkey))
4757 server.dirty++;
4758 addReply(c,shared.czero);
4759 } else {
4760 addReply(c,shared.nullmultibulk);
4761 }
4762 return;
4763 }
4764 if (setobj->type != REDIS_SET) {
4765 zfree(dv);
4766 addReply(c,shared.wrongtypeerr);
4767 return;
4768 }
4769 dv[j] = setobj->ptr;
4770 }
4771 /* Sort sets from the smallest to largest, this will improve our
4772 * algorithm's performace */
4773 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4774
4775 /* The first thing we should output is the total number of elements...
4776 * since this is a multi-bulk write, but at this stage we don't know
4777 * the intersection set size, so we use a trick, append an empty object
4778 * to the output list and save the pointer to later modify it with the
4779 * right length */
4780 if (!dstkey) {
4781 lenobj = createObject(REDIS_STRING,NULL);
4782 addReply(c,lenobj);
4783 decrRefCount(lenobj);
4784 } else {
4785 /* If we have a target key where to store the resulting set
4786 * create this key with an empty set inside */
4787 dstset = createSetObject();
4788 }
4789
4790 /* Iterate all the elements of the first (smallest) set, and test
4791 * the element against all the other sets, if at least one set does
4792 * not include the element it is discarded */
4793 di = dictGetIterator(dv[0]);
4794
4795 while((de = dictNext(di)) != NULL) {
4796 robj *ele;
4797
4798 for (j = 1; j < setsnum; j++)
4799 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4800 if (j != setsnum)
4801 continue; /* at least one set does not contain the member */
4802 ele = dictGetEntryKey(de);
4803 if (!dstkey) {
4804 addReplyBulk(c,ele);
4805 cardinality++;
4806 } else {
4807 dictAdd(dstset->ptr,ele,NULL);
4808 incrRefCount(ele);
4809 }
4810 }
4811 dictReleaseIterator(di);
4812
4813 if (dstkey) {
4814 /* Store the resulting set into the target, if the intersection
4815 * is not an empty set. */
4816 deleteKey(c->db,dstkey);
4817 if (dictSize((dict*)dstset->ptr) > 0) {
4818 dictAdd(c->db->dict,dstkey,dstset);
4819 incrRefCount(dstkey);
4820 addReplyLong(c,dictSize((dict*)dstset->ptr));
4821 } else {
4822 decrRefCount(dstset);
4823 addReply(c,shared.czero);
4824 }
4825 server.dirty++;
4826 } else {
4827 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4828 }
4829 zfree(dv);
4830 }
4831
4832 static void sinterCommand(redisClient *c) {
4833 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4834 }
4835
4836 static void sinterstoreCommand(redisClient *c) {
4837 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4838 }
4839
/* Operation selectors passed as the 'op' argument of the generic set and
 * sorted set commands in this file (sunionDiffGenericCommand() and
 * zunionInterGenericCommand()). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4843
4844 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4845 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4846 dictIterator *di;
4847 dictEntry *de;
4848 robj *dstset = NULL;
4849 int j, cardinality = 0;
4850
4851 for (j = 0; j < setsnum; j++) {
4852 robj *setobj;
4853
4854 setobj = dstkey ?
4855 lookupKeyWrite(c->db,setskeys[j]) :
4856 lookupKeyRead(c->db,setskeys[j]);
4857 if (!setobj) {
4858 dv[j] = NULL;
4859 continue;
4860 }
4861 if (setobj->type != REDIS_SET) {
4862 zfree(dv);
4863 addReply(c,shared.wrongtypeerr);
4864 return;
4865 }
4866 dv[j] = setobj->ptr;
4867 }
4868
4869 /* We need a temp set object to store our union. If the dstkey
4870 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4871 * this set object will be the resulting object to set into the target key*/
4872 dstset = createSetObject();
4873
4874 /* Iterate all the elements of all the sets, add every element a single
4875 * time to the result set */
4876 for (j = 0; j < setsnum; j++) {
4877 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4878 if (!dv[j]) continue; /* non existing keys are like empty sets */
4879
4880 di = dictGetIterator(dv[j]);
4881
4882 while((de = dictNext(di)) != NULL) {
4883 robj *ele;
4884
4885 /* dictAdd will not add the same element multiple times */
4886 ele = dictGetEntryKey(de);
4887 if (op == REDIS_OP_UNION || j == 0) {
4888 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4889 incrRefCount(ele);
4890 cardinality++;
4891 }
4892 } else if (op == REDIS_OP_DIFF) {
4893 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4894 cardinality--;
4895 }
4896 }
4897 }
4898 dictReleaseIterator(di);
4899
4900 /* result set is empty? Exit asap. */
4901 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4902 }
4903
4904 /* Output the content of the resulting set, if not in STORE mode */
4905 if (!dstkey) {
4906 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4907 di = dictGetIterator(dstset->ptr);
4908 while((de = dictNext(di)) != NULL) {
4909 robj *ele;
4910
4911 ele = dictGetEntryKey(de);
4912 addReplyBulk(c,ele);
4913 }
4914 dictReleaseIterator(di);
4915 decrRefCount(dstset);
4916 } else {
4917 /* If we have a target key where to store the resulting set
4918 * create this key with the result set inside */
4919 deleteKey(c->db,dstkey);
4920 if (dictSize((dict*)dstset->ptr) > 0) {
4921 dictAdd(c->db->dict,dstkey,dstset);
4922 incrRefCount(dstkey);
4923 addReplyLong(c,dictSize((dict*)dstset->ptr));
4924 } else {
4925 decrRefCount(dstset);
4926 addReply(c,shared.czero);
4927 }
4928 server.dirty++;
4929 }
4930 zfree(dv);
4931 }
4932
4933 static void sunionCommand(redisClient *c) {
4934 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4935 }
4936
4937 static void sunionstoreCommand(redisClient *c) {
4938 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4939 }
4940
4941 static void sdiffCommand(redisClient *c) {
4942 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4943 }
4944
4945 static void sdiffstoreCommand(redisClient *c) {
4946 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4947 }
4948
4949 /* ==================================== ZSets =============================== */
4950
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4958
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This makes it possible to traverse the
 * list from tail to head, which is useful for ZREVRANGE. */
4967
4968 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4969 zskiplistNode *zn = zmalloc(sizeof(*zn));
4970
4971 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4972 if (level > 0)
4973 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4974 zn->score = score;
4975 zn->obj = obj;
4976 return zn;
4977 }
4978
4979 static zskiplist *zslCreate(void) {
4980 int j;
4981 zskiplist *zsl;
4982
4983 zsl = zmalloc(sizeof(*zsl));
4984 zsl->level = 1;
4985 zsl->length = 0;
4986 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4987 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4988 zsl->header->forward[j] = NULL;
4989
4990 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4991 if (j < ZSKIPLIST_MAXLEVEL-1)
4992 zsl->header->span[j] = 0;
4993 }
4994 zsl->header->backward = NULL;
4995 zsl->tail = NULL;
4996 return zsl;
4997 }
4998
4999 static void zslFreeNode(zskiplistNode *node) {
5000 decrRefCount(node->obj);
5001 zfree(node->forward);
5002 zfree(node->span);
5003 zfree(node);
5004 }
5005
5006 static void zslFree(zskiplist *zsl) {
5007 zskiplistNode *node = zsl->header->forward[0], *next;
5008
5009 zfree(zsl->header->forward);
5010 zfree(zsl->header->span);
5011 zfree(zsl->header);
5012 while(node) {
5013 next = node->forward[0];
5014 zslFreeNode(node);
5015 node = next;
5016 }
5017 zfree(zsl);
5018 }
5019
5020 static int zslRandomLevel(void) {
5021 int level = 1;
5022 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5023 level += 1;
5024 return level;
5025 }
5026
5027 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5028 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5029 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5030 int i, level;
5031
5032 x = zsl->header;
5033 for (i = zsl->level-1; i >= 0; i--) {
5034 /* store rank that is crossed to reach the insert position */
5035 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5036
5037 while (x->forward[i] &&
5038 (x->forward[i]->score < score ||
5039 (x->forward[i]->score == score &&
5040 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5041 rank[i] += i > 0 ? x->span[i-1] : 1;
5042 x = x->forward[i];
5043 }
5044 update[i] = x;
5045 }
5046 /* we assume the key is not already inside, since we allow duplicated
5047 * scores, and the re-insertion of score and redis object should never
5048 * happpen since the caller of zslInsert() should test in the hash table
5049 * if the element is already inside or not. */
5050 level = zslRandomLevel();
5051 if (level > zsl->level) {
5052 for (i = zsl->level; i < level; i++) {
5053 rank[i] = 0;
5054 update[i] = zsl->header;
5055 update[i]->span[i-1] = zsl->length;
5056 }
5057 zsl->level = level;
5058 }
5059 x = zslCreateNode(level,score,obj);
5060 for (i = 0; i < level; i++) {
5061 x->forward[i] = update[i]->forward[i];
5062 update[i]->forward[i] = x;
5063
5064 /* update span covered by update[i] as x is inserted here */
5065 if (i > 0) {
5066 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5067 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5068 }
5069 }
5070
5071 /* increment span for untouched levels */
5072 for (i = level; i < zsl->level; i++) {
5073 update[i]->span[i-1]++;
5074 }
5075
5076 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5077 if (x->forward[0])
5078 x->forward[0]->backward = x;
5079 else
5080 zsl->tail = x;
5081 zsl->length++;
5082 }
5083
5084 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5085 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5086 int i;
5087 for (i = 0; i < zsl->level; i++) {
5088 if (update[i]->forward[i] == x) {
5089 if (i > 0) {
5090 update[i]->span[i-1] += x->span[i-1] - 1;
5091 }
5092 update[i]->forward[i] = x->forward[i];
5093 } else {
5094 /* invariant: i > 0, because update[0]->forward[0]
5095 * is always equal to x */
5096 update[i]->span[i-1] -= 1;
5097 }
5098 }
5099 if (x->forward[0]) {
5100 x->forward[0]->backward = x->backward;
5101 } else {
5102 zsl->tail = x->backward;
5103 }
5104 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5105 zsl->level--;
5106 zsl->length--;
5107 }
5108
5109 /* Delete an element with matching score/object from the skiplist. */
5110 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5111 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5112 int i;
5113
5114 x = zsl->header;
5115 for (i = zsl->level-1; i >= 0; i--) {
5116 while (x->forward[i] &&
5117 (x->forward[i]->score < score ||
5118 (x->forward[i]->score == score &&
5119 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5120 x = x->forward[i];
5121 update[i] = x;
5122 }
5123 /* We may have multiple elements with the same score, what we need
5124 * is to find the element with both the right score and object. */
5125 x = x->forward[0];
5126 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5127 zslDeleteNode(zsl, x, update);
5128 zslFreeNode(x);
5129 return 1;
5130 } else {
5131 return 0; /* not found */
5132 }
5133 return 0; /* not found */
5134 }
5135
5136 /* Delete all the elements with score between min and max from the skiplist.
5137 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5138 * Note that this function takes the reference to the hash table view of the
5139 * sorted set, in order to remove the elements from the hash table too. */
5140 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5141 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5142 unsigned long removed = 0;
5143 int i;
5144
5145 x = zsl->header;
5146 for (i = zsl->level-1; i >= 0; i--) {
5147 while (x->forward[i] && x->forward[i]->score < min)
5148 x = x->forward[i];
5149 update[i] = x;
5150 }
5151 /* We may have multiple elements with the same score, what we need
5152 * is to find the element with both the right score and object. */
5153 x = x->forward[0];
5154 while (x && x->score <= max) {
5155 zskiplistNode *next = x->forward[0];
5156 zslDeleteNode(zsl, x, update);
5157 dictDelete(dict,x->obj);
5158 zslFreeNode(x);
5159 removed++;
5160 x = next;
5161 }
5162 return removed; /* not found */
5163 }
5164
5165 /* Delete all the elements with rank between start and end from the skiplist.
5166 * Start and end are inclusive. Note that start and end need to be 1-based */
5167 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5168 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5169 unsigned long traversed = 0, removed = 0;
5170 int i;
5171
5172 x = zsl->header;
5173 for (i = zsl->level-1; i >= 0; i--) {
5174 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5175 traversed += i > 0 ? x->span[i-1] : 1;
5176 x = x->forward[i];
5177 }
5178 update[i] = x;
5179 }
5180
5181 traversed++;
5182 x = x->forward[0];
5183 while (x && traversed <= end) {
5184 zskiplistNode *next = x->forward[0];
5185 zslDeleteNode(zsl, x, update);
5186 dictDelete(dict,x->obj);
5187 zslFreeNode(x);
5188 removed++;
5189 traversed++;
5190 x = next;
5191 }
5192 return removed;
5193 }
5194
5195 /* Find the first node having a score equal or greater than the specified one.
5196 * Returns NULL if there is no match. */
5197 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5198 zskiplistNode *x;
5199 int i;
5200
5201 x = zsl->header;
5202 for (i = zsl->level-1; i >= 0; i--) {
5203 while (x->forward[i] && x->forward[i]->score < score)
5204 x = x->forward[i];
5205 }
5206 /* We may have multiple elements with the same score, what we need
5207 * is to find the element with both the right score and object. */
5208 return x->forward[0];
5209 }
5210
5211 /* Find the rank for an element by both score and key.
5212 * Returns 0 when the element cannot be found, rank otherwise.
5213 * Note that the rank is 1-based due to the span of zsl->header to the
5214 * first element. */
5215 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5216 zskiplistNode *x;
5217 unsigned long rank = 0;
5218 int i;
5219
5220 x = zsl->header;
5221 for (i = zsl->level-1; i >= 0; i--) {
5222 while (x->forward[i] &&
5223 (x->forward[i]->score < score ||
5224 (x->forward[i]->score == score &&
5225 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5226 rank += i > 0 ? x->span[i-1] : 1;
5227 x = x->forward[i];
5228 }
5229
5230 /* x might be equal to zsl->header, so test if obj is non-NULL */
5231 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5232 return rank;
5233 }
5234 }
5235 return 0;
5236 }
5237
5238 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5239 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5240 zskiplistNode *x;
5241 unsigned long traversed = 0;
5242 int i;
5243
5244 x = zsl->header;
5245 for (i = zsl->level-1; i >= 0; i--) {
5246 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5247 {
5248 traversed += i > 0 ? x->span[i-1] : 1;
5249 x = x->forward[i];
5250 }
5251 if (traversed == rank) {
5252 return x;
5253 }
5254 }
5255 return NULL;
5256 }
5257
5258 /* The actual Z-commands implementations */
5259
5260 /* This generic command implements both ZADD and ZINCRBY.
5261 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5262 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5263 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5264 robj *zsetobj;
5265 zset *zs;
5266 double *score;
5267
5268 zsetobj = lookupKeyWrite(c->db,key);
5269 if (zsetobj == NULL) {
5270 zsetobj = createZsetObject();
5271 dictAdd(c->db->dict,key,zsetobj);
5272 incrRefCount(key);
5273 } else {
5274 if (zsetobj->type != REDIS_ZSET) {
5275 addReply(c,shared.wrongtypeerr);
5276 return;
5277 }
5278 }
5279 zs = zsetobj->ptr;
5280
5281 /* Ok now since we implement both ZADD and ZINCRBY here the code
5282 * needs to handle the two different conditions. It's all about setting
5283 * '*score', that is, the new score to set, to the right value. */
5284 score = zmalloc(sizeof(double));
5285 if (doincrement) {
5286 dictEntry *de;
5287
5288 /* Read the old score. If the element was not present starts from 0 */
5289 de = dictFind(zs->dict,ele);
5290 if (de) {
5291 double *oldscore = dictGetEntryVal(de);
5292 *score = *oldscore + scoreval;
5293 } else {
5294 *score = scoreval;
5295 }
5296 } else {
5297 *score = scoreval;
5298 }
5299
5300 /* What follows is a simple remove and re-insert operation that is common
5301 * to both ZADD and ZINCRBY... */
5302 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5303 /* case 1: New element */
5304 incrRefCount(ele); /* added to hash */
5305 zslInsert(zs->zsl,*score,ele);
5306 incrRefCount(ele); /* added to skiplist */
5307 server.dirty++;
5308 if (doincrement)
5309 addReplyDouble(c,*score);
5310 else
5311 addReply(c,shared.cone);
5312 } else {
5313 dictEntry *de;
5314 double *oldscore;
5315
5316 /* case 2: Score update operation */
5317 de = dictFind(zs->dict,ele);
5318 redisAssert(de != NULL);
5319 oldscore = dictGetEntryVal(de);
5320 if (*score != *oldscore) {
5321 int deleted;
5322
5323 /* Remove and insert the element in the skip list with new score */
5324 deleted = zslDelete(zs->zsl,*oldscore,ele);
5325 redisAssert(deleted != 0);
5326 zslInsert(zs->zsl,*score,ele);
5327 incrRefCount(ele);
5328 /* Update the score in the hash table */
5329 dictReplace(zs->dict,ele,score);
5330 server.dirty++;
5331 } else {
5332 zfree(score);
5333 }
5334 if (doincrement)
5335 addReplyDouble(c,*score);
5336 else
5337 addReply(c,shared.czero);
5338 }
5339 }
5340
5341 static void zaddCommand(redisClient *c) {
5342 double scoreval;
5343
5344 scoreval = strtod(c->argv[2]->ptr,NULL);
5345 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5346 }
5347
5348 static void zincrbyCommand(redisClient *c) {
5349 double scoreval;
5350
5351 scoreval = strtod(c->argv[2]->ptr,NULL);
5352 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5353 }
5354
5355 static void zremCommand(redisClient *c) {
5356 robj *zsetobj;
5357 zset *zs;
5358 dictEntry *de;
5359 double *oldscore;
5360 int deleted;
5361
5362 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5363 checkType(c,zsetobj,REDIS_ZSET)) return;
5364
5365 zs = zsetobj->ptr;
5366 de = dictFind(zs->dict,c->argv[2]);
5367 if (de == NULL) {
5368 addReply(c,shared.czero);
5369 return;
5370 }
5371 /* Delete from the skiplist */
5372 oldscore = dictGetEntryVal(de);
5373 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5374 redisAssert(deleted != 0);
5375
5376 /* Delete from the hash table */
5377 dictDelete(zs->dict,c->argv[2]);
5378 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5379 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5380 server.dirty++;
5381 addReply(c,shared.cone);
5382 }
5383
5384 static void zremrangebyscoreCommand(redisClient *c) {
5385 double min = strtod(c->argv[2]->ptr,NULL);
5386 double max = strtod(c->argv[3]->ptr,NULL);
5387 long deleted;
5388 robj *zsetobj;
5389 zset *zs;
5390
5391 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5392 checkType(c,zsetobj,REDIS_ZSET)) return;
5393
5394 zs = zsetobj->ptr;
5395 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5396 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5397 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5398 server.dirty += deleted;
5399 addReplyLong(c,deleted);
5400 }
5401
5402 static void zremrangebyrankCommand(redisClient *c) {
5403 int start = atoi(c->argv[2]->ptr);
5404 int end = atoi(c->argv[3]->ptr);
5405 int llen;
5406 long deleted;
5407 robj *zsetobj;
5408 zset *zs;
5409
5410 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5411 checkType(c,zsetobj,REDIS_ZSET)) return;
5412 zs = zsetobj->ptr;
5413 llen = zs->zsl->length;
5414
5415 /* convert negative indexes */
5416 if (start < 0) start = llen+start;
5417 if (end < 0) end = llen+end;
5418 if (start < 0) start = 0;
5419 if (end < 0) end = 0;
5420
5421 /* indexes sanity checks */
5422 if (start > end || start >= llen) {
5423 addReply(c,shared.czero);
5424 return;
5425 }
5426 if (end >= llen) end = llen-1;
5427
5428 /* increment start and end because zsl*Rank functions
5429 * use 1-based rank */
5430 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5431 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5432 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5433 server.dirty += deleted;
5434 addReplyLong(c, deleted);
5435 }
5436
5437 typedef struct {
5438 dict *dict;
5439 double weight;
5440 } zsetopsrc;
5441
5442 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5443 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5444 unsigned long size1, size2;
5445 size1 = d1->dict ? dictSize(d1->dict) : 0;
5446 size2 = d2->dict ? dictSize(d2->dict) : 0;
5447 return size1 - size2;
5448 }
5449
/* How zunionInterAggregate() combines the scores of an element that is
 * present in more than one source sorted set. */
#define REDIS_AGGR_SUM 1 /* add the scores */
#define REDIS_AGGR_MIN 2 /* keep the smallest score */
#define REDIS_AGGR_MAX 3 /* keep the largest score */
5453
5454 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5455 if (aggregate == REDIS_AGGR_SUM) {
5456 *target = *target + val;
5457 } else if (aggregate == REDIS_AGGR_MIN) {
5458 *target = val < *target ? val : *target;
5459 } else if (aggregate == REDIS_AGGR_MAX) {
5460 *target = val > *target ? val : *target;
5461 } else {
5462 /* safety net */
5463 redisAssert(0 != 0);
5464 }
5465 }
5466
5467 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5468 int i, j, zsetnum;
5469 int aggregate = REDIS_AGGR_SUM;
5470 zsetopsrc *src;
5471 robj *dstobj;
5472 zset *dstzset;
5473 dictIterator *di;
5474 dictEntry *de;
5475
5476 /* expect zsetnum input keys to be given */
5477 zsetnum = atoi(c->argv[2]->ptr);
5478 if (zsetnum < 1) {
5479 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5480 return;
5481 }
5482
5483 /* test if the expected number of keys would overflow */
5484 if (3+zsetnum > c->argc) {
5485 addReply(c,shared.syntaxerr);
5486 return;
5487 }
5488
5489 /* read keys to be used for input */
5490 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5491 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5492 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5493 if (!zsetobj) {
5494 src[i].dict = NULL;
5495 } else {
5496 if (zsetobj->type != REDIS_ZSET) {
5497 zfree(src);
5498 addReply(c,shared.wrongtypeerr);
5499 return;
5500 }
5501 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5502 }
5503
5504 /* default all weights to 1 */
5505 src[i].weight = 1.0;
5506 }
5507
5508 /* parse optional extra arguments */
5509 if (j < c->argc) {
5510 int remaining = c->argc - j;
5511
5512 while (remaining) {
5513 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5514 j++; remaining--;
5515 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5516 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5517 }
5518 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5519 j++; remaining--;
5520 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5521 aggregate = REDIS_AGGR_SUM;
5522 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5523 aggregate = REDIS_AGGR_MIN;
5524 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5525 aggregate = REDIS_AGGR_MAX;
5526 } else {
5527 zfree(src);
5528 addReply(c,shared.syntaxerr);
5529 return;
5530 }
5531 j++; remaining--;
5532 } else {
5533 zfree(src);
5534 addReply(c,shared.syntaxerr);
5535 return;
5536 }
5537 }
5538 }
5539
5540 /* sort sets from the smallest to largest, this will improve our
5541 * algorithm's performance */
5542 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5543
5544 dstobj = createZsetObject();
5545 dstzset = dstobj->ptr;
5546
5547 if (op == REDIS_OP_INTER) {
5548 /* skip going over all entries if the smallest zset is NULL or empty */
5549 if (src[0].dict && dictSize(src[0].dict) > 0) {
5550 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5551 * from small to large, all src[i > 0].dict are non-empty too */
5552 di = dictGetIterator(src[0].dict);
5553 while((de = dictNext(di)) != NULL) {
5554 double *score = zmalloc(sizeof(double)), value;
5555 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5556
5557 for (j = 1; j < zsetnum; j++) {
5558 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5559 if (other) {
5560 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5561 zunionInterAggregate(score, value, aggregate);
5562 } else {
5563 break;
5564 }
5565 }
5566
5567 /* skip entry when not present in every source dict */
5568 if (j != zsetnum) {
5569 zfree(score);
5570 } else {
5571 robj *o = dictGetEntryKey(de);
5572 dictAdd(dstzset->dict,o,score);
5573 incrRefCount(o); /* added to dictionary */
5574 zslInsert(dstzset->zsl,*score,o);
5575 incrRefCount(o); /* added to skiplist */
5576 }
5577 }
5578 dictReleaseIterator(di);
5579 }
5580 } else if (op == REDIS_OP_UNION) {
5581 for (i = 0; i < zsetnum; i++) {
5582 if (!src[i].dict) continue;
5583
5584 di = dictGetIterator(src[i].dict);
5585 while((de = dictNext(di)) != NULL) {
5586 /* skip key when already processed */
5587 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5588
5589 double *score = zmalloc(sizeof(double)), value;
5590 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5591
5592 /* because the zsets are sorted by size, its only possible
5593 * for sets at larger indices to hold this entry */
5594 for (j = (i+1); j < zsetnum; j++) {
5595 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5596 if (other) {
5597 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5598 zunionInterAggregate(score, value, aggregate);
5599 }
5600 }
5601
5602 robj *o = dictGetEntryKey(de);
5603 dictAdd(dstzset->dict,o,score);
5604 incrRefCount(o); /* added to dictionary */
5605 zslInsert(dstzset->zsl,*score,o);
5606 incrRefCount(o); /* added to skiplist */
5607 }
5608 dictReleaseIterator(di);
5609 }
5610 } else {
5611 /* unknown operator */
5612 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5613 }
5614
5615 deleteKey(c->db,dstkey);
5616 if (dstzset->zsl->length) {
5617 dictAdd(c->db->dict,dstkey,dstobj);
5618 incrRefCount(dstkey);
5619 addReplyLong(c, dstzset->zsl->length);
5620 server.dirty++;
5621 } else {
5622 decrRefCount(dstzset);
5623 addReply(c, shared.czero);
5624 }
5625 zfree(src);
5626 }
5627
5628 static void zunionCommand(redisClient *c) {
5629 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5630 }
5631
5632 static void zinterCommand(redisClient *c) {
5633 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5634 }
5635
5636 static void zrangeGenericCommand(redisClient *c, int reverse) {
5637 robj *o;
5638 int start = atoi(c->argv[2]->ptr);
5639 int end = atoi(c->argv[3]->ptr);
5640 int withscores = 0;
5641 int llen;
5642 int rangelen, j;
5643 zset *zsetobj;
5644 zskiplist *zsl;
5645 zskiplistNode *ln;
5646 robj *ele;
5647
5648 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5649 withscores = 1;
5650 } else if (c->argc >= 5) {
5651 addReply(c,shared.syntaxerr);
5652 return;
5653 }
5654
5655 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5656 checkType(c,o,REDIS_ZSET)) return;
5657 zsetobj = o->ptr;
5658 zsl = zsetobj->zsl;
5659 llen = zsl->length;
5660
5661 /* convert negative indexes */
5662 if (start < 0) start = llen+start;
5663 if (end < 0) end = llen+end;
5664 if (start < 0) start = 0;
5665 if (end < 0) end = 0;
5666
5667 /* indexes sanity checks */
5668 if (start > end || start >= llen) {
5669 /* Out of range start or start > end result in empty list */
5670 addReply(c,shared.emptymultibulk);
5671 return;
5672 }
5673 if (end >= llen) end = llen-1;
5674 rangelen = (end-start)+1;
5675
5676 /* check if starting point is trivial, before searching
5677 * the element in log(N) time */
5678 if (reverse) {
5679 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5680 } else {
5681 ln = start == 0 ?
5682 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5683 }
5684
5685 /* Return the result in form of a multi-bulk reply */
5686 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5687 withscores ? (rangelen*2) : rangelen));
5688 for (j = 0; j < rangelen; j++) {
5689 ele = ln->obj;
5690 addReplyBulk(c,ele);
5691 if (withscores)
5692 addReplyDouble(c,ln->score);
5693 ln = reverse ? ln->backward : ln->forward[0];
5694 }
5695 }
5696
5697 static void zrangeCommand(redisClient *c) {
5698 zrangeGenericCommand(c,0);
5699 }
5700
5701 static void zrevrangeCommand(redisClient *c) {
5702 zrangeGenericCommand(c,1);
5703 }
5704
5705 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5706 * If justcount is non-zero, just the count is returned. */
5707 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5708 robj *o;
5709 double min, max;
5710 int minex = 0, maxex = 0; /* are min or max exclusive? */
5711 int offset = 0, limit = -1;
5712 int withscores = 0;
5713 int badsyntax = 0;
5714
5715 /* Parse the min-max interval. If one of the values is prefixed
5716 * by the "(" character, it's considered "open". For instance
5717 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5718 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5719 if (((char*)c->argv[2]->ptr)[0] == '(') {
5720 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5721 minex = 1;
5722 } else {
5723 min = strtod(c->argv[2]->ptr,NULL);
5724 }
5725 if (((char*)c->argv[3]->ptr)[0] == '(') {
5726 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5727 maxex = 1;
5728 } else {
5729 max = strtod(c->argv[3]->ptr,NULL);
5730 }
5731
5732 /* Parse "WITHSCORES": note that if the command was called with
5733 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5734 * enter the following paths to parse WITHSCORES and LIMIT. */
5735 if (c->argc == 5 || c->argc == 8) {
5736 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5737 withscores = 1;
5738 else
5739 badsyntax = 1;
5740 }
5741 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5742 badsyntax = 1;
5743 if (badsyntax) {
5744 addReplySds(c,
5745 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5746 return;
5747 }
5748
5749 /* Parse "LIMIT" */
5750 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5751 addReply(c,shared.syntaxerr);
5752 return;
5753 } else if (c->argc == (7 + withscores)) {
5754 offset = atoi(c->argv[5]->ptr);
5755 limit = atoi(c->argv[6]->ptr);
5756 if (offset < 0) offset = 0;
5757 }
5758
5759 /* Ok, lookup the key and get the range */
5760 o = lookupKeyRead(c->db,c->argv[1]);
5761 if (o == NULL) {
5762 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5763 } else {
5764 if (o->type != REDIS_ZSET) {
5765 addReply(c,shared.wrongtypeerr);
5766 } else {
5767 zset *zsetobj = o->ptr;
5768 zskiplist *zsl = zsetobj->zsl;
5769 zskiplistNode *ln;
5770 robj *ele, *lenobj = NULL;
5771 unsigned long rangelen = 0;
5772
5773 /* Get the first node with the score >= min, or with
5774 * score > min if 'minex' is true. */
5775 ln = zslFirstWithScore(zsl,min);
5776 while (minex && ln && ln->score == min) ln = ln->forward[0];
5777
5778 if (ln == NULL) {
5779 /* No element matching the speciifed interval */
5780 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5781 return;
5782 }
5783
5784 /* We don't know in advance how many matching elements there
5785 * are in the list, so we push this object that will represent
5786 * the multi-bulk length in the output buffer, and will "fix"
5787 * it later */
5788 if (!justcount) {
5789 lenobj = createObject(REDIS_STRING,NULL);
5790 addReply(c,lenobj);
5791 decrRefCount(lenobj);
5792 }
5793
5794 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5795 if (offset) {
5796 offset--;
5797 ln = ln->forward[0];
5798 continue;
5799 }
5800 if (limit == 0) break;
5801 if (!justcount) {
5802 ele = ln->obj;
5803 addReplyBulk(c,ele);
5804 if (withscores)
5805 addReplyDouble(c,ln->score);
5806 }
5807 ln = ln->forward[0];
5808 rangelen++;
5809 if (limit > 0) limit--;
5810 }
5811 if (justcount) {
5812 addReplyLong(c,(long)rangelen);
5813 } else {
5814 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5815 withscores ? (rangelen*2) : rangelen);
5816 }
5817 }
5818 }
5819 }
5820
5821 static void zrangebyscoreCommand(redisClient *c) {
5822 genericZrangebyscoreCommand(c,0);
5823 }
5824
5825 static void zcountCommand(redisClient *c) {
5826 genericZrangebyscoreCommand(c,1);
5827 }
5828
5829 static void zcardCommand(redisClient *c) {
5830 robj *o;
5831 zset *zs;
5832
5833 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5834 checkType(c,o,REDIS_ZSET)) return;
5835
5836 zs = o->ptr;
5837 addReplyUlong(c,zs->zsl->length);
5838 }
5839
5840 static void zscoreCommand(redisClient *c) {
5841 robj *o;
5842 zset *zs;
5843 dictEntry *de;
5844
5845 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5846 checkType(c,o,REDIS_ZSET)) return;
5847
5848 zs = o->ptr;
5849 de = dictFind(zs->dict,c->argv[2]);
5850 if (!de) {
5851 addReply(c,shared.nullbulk);
5852 } else {
5853 double *score = dictGetEntryVal(de);
5854
5855 addReplyDouble(c,*score);
5856 }
5857 }
5858
5859 static void zrankGenericCommand(redisClient *c, int reverse) {
5860 robj *o;
5861 zset *zs;
5862 zskiplist *zsl;
5863 dictEntry *de;
5864 unsigned long rank;
5865 double *score;
5866
5867 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5868 checkType(c,o,REDIS_ZSET)) return;
5869
5870 zs = o->ptr;
5871 zsl = zs->zsl;
5872 de = dictFind(zs->dict,c->argv[2]);
5873 if (!de) {
5874 addReply(c,shared.nullbulk);
5875 return;
5876 }
5877
5878 score = dictGetEntryVal(de);
5879 rank = zslGetRank(zsl, *score, c->argv[2]);
5880 if (rank) {
5881 if (reverse) {
5882 addReplyLong(c, zsl->length - rank);
5883 } else {
5884 addReplyLong(c, rank-1);
5885 }
5886 } else {
5887 addReply(c,shared.nullbulk);
5888 }
5889 }
5890
5891 static void zrankCommand(redisClient *c) {
5892 zrankGenericCommand(c, 0);
5893 }
5894
5895 static void zrevrankCommand(redisClient *c) {
5896 zrankGenericCommand(c, 1);
5897 }
5898
5899 /* =================================== Hashes =============================== */
5900 static void hsetCommand(redisClient *c) {
5901 int update = 0;
5902 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5903
5904 if (o == NULL) {
5905 o = createHashObject();
5906 dictAdd(c->db->dict,c->argv[1],o);
5907 incrRefCount(c->argv[1]);
5908 } else {
5909 if (o->type != REDIS_HASH) {
5910 addReply(c,shared.wrongtypeerr);
5911 return;
5912 }
5913 }
5914 /* We want to convert the zipmap into an hash table right now if the
5915 * entry to be added is too big. Note that we check if the object
5916 * is integer encoded before to try fetching the length in the test below.
5917 * This is because integers are small, but currently stringObjectLen()
5918 * performs a slow conversion: not worth it. */
5919 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5920 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5921 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5922 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5923 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5924 {
5925 convertToRealHash(o);
5926 }
5927
5928 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5929 unsigned char *zm = o->ptr;
5930 robj *valobj = getDecodedObject(c->argv[3]);
5931
5932 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5933 valobj->ptr,sdslen(valobj->ptr),&update);
5934 decrRefCount(valobj);
5935 o->ptr = zm;
5936
5937 /* And here there is the second check for hash conversion...
5938 * we want to do it only if the operation was not just an update as
5939 * zipmapLen() is O(N). */
5940 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5941 convertToRealHash(o);
5942 } else {
5943 tryObjectEncoding(c->argv[2]);
5944 /* note that c->argv[3] is already encoded, as the latest arg
5945 * of a bulk command is always integer encoded if possible. */
5946 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5947 incrRefCount(c->argv[2]);
5948 } else {
5949 update = 1;
5950 }
5951 incrRefCount(c->argv[3]);
5952 }
5953 server.dirty++;
5954 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5955 }
5956
5957 static void hgetCommand(redisClient *c) {
5958 robj *o;
5959
5960 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5961 checkType(c,o,REDIS_HASH)) return;
5962
5963 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5964 unsigned char *zm = o->ptr;
5965 unsigned char *val;
5966 unsigned int vlen;
5967 robj *field;
5968
5969 field = getDecodedObject(c->argv[2]);
5970 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
5971 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5972 addReplySds(c,sdsnewlen(val,vlen));
5973 addReply(c,shared.crlf);
5974 decrRefCount(field);
5975 return;
5976 } else {
5977 addReply(c,shared.nullbulk);
5978 decrRefCount(field);
5979 return;
5980 }
5981 } else {
5982 struct dictEntry *de;
5983
5984 de = dictFind(o->ptr,c->argv[2]);
5985 if (de == NULL) {
5986 addReply(c,shared.nullbulk);
5987 } else {
5988 robj *e = dictGetEntryVal(de);
5989
5990 addReplyBulk(c,e);
5991 }
5992 }
5993 }
5994
5995 static void hdelCommand(redisClient *c) {
5996 robj *o;
5997 int deleted = 0;
5998
5999 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6000 checkType(c,o,REDIS_HASH)) return;
6001
6002 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6003 robj *field = getDecodedObject(c->argv[2]);
6004
6005 o->ptr = zipmapDel((unsigned char*) o->ptr,
6006 (unsigned char*) field->ptr,
6007 sdslen(field->ptr), &deleted);
6008 decrRefCount(field);
6009 if (zipmapLen((unsigned char*) o->ptr) == 0)
6010 deleteKey(c->db,c->argv[1]);
6011 } else {
6012 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6013 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6014 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6015 }
6016 if (deleted) server.dirty++;
6017 addReply(c,deleted ? shared.cone : shared.czero);
6018 }
6019
6020 static void hlenCommand(redisClient *c) {
6021 robj *o;
6022 unsigned long len;
6023
6024 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6025 checkType(c,o,REDIS_HASH)) return;
6026
6027 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6028 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6029 addReplyUlong(c,len);
6030 }
6031
6032 #define REDIS_GETALL_KEYS 1
6033 #define REDIS_GETALL_VALS 2
6034 static void genericHgetallCommand(redisClient *c, int flags) {
6035 robj *o, *lenobj;
6036 unsigned long count = 0;
6037
6038 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6039 || checkType(c,o,REDIS_HASH)) return;
6040
6041 lenobj = createObject(REDIS_STRING,NULL);
6042 addReply(c,lenobj);
6043 decrRefCount(lenobj);
6044
6045 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6046 unsigned char *p = zipmapRewind(o->ptr);
6047 unsigned char *field, *val;
6048 unsigned int flen, vlen;
6049
6050 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6051 robj *aux;
6052
6053 if (flags & REDIS_GETALL_KEYS) {
6054 aux = createStringObject((char*)field,flen);
6055 addReplyBulk(c,aux);
6056 decrRefCount(aux);
6057 count++;
6058 }
6059 if (flags & REDIS_GETALL_VALS) {
6060 aux = createStringObject((char*)val,vlen);
6061 addReplyBulk(c,aux);
6062 decrRefCount(aux);
6063 count++;
6064 }
6065 }
6066 } else {
6067 dictIterator *di = dictGetIterator(o->ptr);
6068 dictEntry *de;
6069
6070 while((de = dictNext(di)) != NULL) {
6071 robj *fieldobj = dictGetEntryKey(de);
6072 robj *valobj = dictGetEntryVal(de);
6073
6074 if (flags & REDIS_GETALL_KEYS) {
6075 addReplyBulk(c,fieldobj);
6076 count++;
6077 }
6078 if (flags & REDIS_GETALL_VALS) {
6079 addReplyBulk(c,valobj);
6080 count++;
6081 }
6082 }
6083 dictReleaseIterator(di);
6084 }
6085 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6086 }
6087
6088 static void hkeysCommand(redisClient *c) {
6089 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6090 }
6091
6092 static void hvalsCommand(redisClient *c) {
6093 genericHgetallCommand(c,REDIS_GETALL_VALS);
6094 }
6095
6096 static void hgetallCommand(redisClient *c) {
6097 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6098 }
6099
6100 static void hexistsCommand(redisClient *c) {
6101 robj *o;
6102 int exists = 0;
6103
6104 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6105 checkType(c,o,REDIS_HASH)) return;
6106
6107 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6108 robj *field;
6109 unsigned char *zm = o->ptr;
6110
6111 field = getDecodedObject(c->argv[2]);
6112 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6113 decrRefCount(field);
6114 } else {
6115 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6116 }
6117 addReply(c,exists ? shared.cone : shared.czero);
6118 }
6119
6120 static void convertToRealHash(robj *o) {
6121 unsigned char *key, *val, *p, *zm = o->ptr;
6122 unsigned int klen, vlen;
6123 dict *dict = dictCreate(&hashDictType,NULL);
6124
6125 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6126 p = zipmapRewind(zm);
6127 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6128 robj *keyobj, *valobj;
6129
6130 keyobj = createStringObject((char*)key,klen);
6131 valobj = createStringObject((char*)val,vlen);
6132 tryObjectEncoding(keyobj);
6133 tryObjectEncoding(valobj);
6134 dictAdd(dict,keyobj,valobj);
6135 }
6136 o->encoding = REDIS_ENCODING_HT;
6137 o->ptr = dict;
6138 zfree(zm);
6139 }
6140
6141 /* ========================= Non type-specific commands ==================== */
6142
6143 static void flushdbCommand(redisClient *c) {
6144 server.dirty += dictSize(c->db->dict);
6145 dictEmpty(c->db->dict);
6146 dictEmpty(c->db->expires);
6147 addReply(c,shared.ok);
6148 }
6149
/* FLUSHALL: empty every database via emptyDb(), adding its return value
 * to the dirty counter, and reply OK. The dataset is then saved
 * synchronously with rdbSave(). */
static void flushallCommand(redisClient *c) {
    server.dirty += emptyDb();
    addReply(c,shared.ok);
    /* A background save child is running: kill it and remove its
     * temporary file before saving from this process. */
    if (server.bgsavechildpid != -1) {
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }
    rdbSave(server.dbfilename);
    /* NOTE(review): presumably rdbSave() resets server.dirty, and this
     * increment keeps the flush counted as a change afterwards - confirm
     * against rdbSave(). */
    server.dirty++;
}
6160
6161 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6162 redisSortOperation *so = zmalloc(sizeof(*so));
6163 so->type = type;
6164 so->pattern = pattern;
6165 return so;
6166 }
6167
6168 /* Return the value associated to the key with a name obtained
6169 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6170 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6171 char *p;
6172 sds spat, ssub;
6173 robj keyobj;
6174 int prefixlen, sublen, postfixlen;
6175 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6176 struct {
6177 long len;
6178 long free;
6179 char buf[REDIS_SORTKEY_MAX+1];
6180 } keyname;
6181
6182 /* If the pattern is "#" return the substitution object itself in order
6183 * to implement the "SORT ... GET #" feature. */
6184 spat = pattern->ptr;
6185 if (spat[0] == '#' && spat[1] == '\0') {
6186 return subst;
6187 }
6188
6189 /* The substitution object may be specially encoded. If so we create
6190 * a decoded object on the fly. Otherwise getDecodedObject will just
6191 * increment the ref count, that we'll decrement later. */
6192 subst = getDecodedObject(subst);
6193
6194 ssub = subst->ptr;
6195 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6196 p = strchr(spat,'*');
6197 if (!p) {
6198 decrRefCount(subst);
6199 return NULL;
6200 }
6201
6202 prefixlen = p-spat;
6203 sublen = sdslen(ssub);
6204 postfixlen = sdslen(spat)-(prefixlen+1);
6205 memcpy(keyname.buf,spat,prefixlen);
6206 memcpy(keyname.buf+prefixlen,ssub,sublen);
6207 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6208 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6209 keyname.len = prefixlen+sublen+postfixlen;
6210
6211 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6212 decrRefCount(subst);
6213
6214 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6215 return lookupKeyRead(db,&keyobj);
6216 }
6217
6218 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6219 * the additional parameter is not standard but a BSD-specific we have to
6220 * pass sorting parameters via the global 'server' structure */
6221 static int sortCompare(const void *s1, const void *s2) {
6222 const redisSortObject *so1 = s1, *so2 = s2;
6223 int cmp;
6224
6225 if (!server.sort_alpha) {
6226 /* Numeric sorting. Here it's trivial as we precomputed scores */
6227 if (so1->u.score > so2->u.score) {
6228 cmp = 1;
6229 } else if (so1->u.score < so2->u.score) {
6230 cmp = -1;
6231 } else {
6232 cmp = 0;
6233 }
6234 } else {
6235 /* Alphanumeric sorting */
6236 if (server.sort_bypattern) {
6237 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6238 /* At least one compare object is NULL */
6239 if (so1->u.cmpobj == so2->u.cmpobj)
6240 cmp = 0;
6241 else if (so1->u.cmpobj == NULL)
6242 cmp = -1;
6243 else
6244 cmp = 1;
6245 } else {
6246 /* We have both the objects, use strcoll */
6247 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6248 }
6249 } else {
6250 /* Compare elements directly */
6251 robj *dec1, *dec2;
6252
6253 dec1 = getDecodedObject(so1->obj);
6254 dec2 = getDecodedObject(so2->obj);
6255 cmp = strcoll(dec1->ptr,dec2->ptr);
6256 decrRefCount(dec1);
6257 decrRefCount(dec2);
6258 }
6259 }
6260 return server.sort_desc ? -cmp : cmp;
6261 }
6262
/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * Options parsed below: ASC, DESC, ALPHA, LIMIT <start> <count>,
 * STORE <key>, BY <pattern>, GET <pattern> (may be repeated).
 *
 * Without STORE the result is sent to the client as a multi-bulk reply.
 * With STORE it is saved as a list at the given key and the number of
 * stored elements is replied as an integer. */
static void sortCommand(redisClient *c) {
    list *operations;
    int outputlen = 0;
    int desc = 0, alpha = 0;
    int limit_start = 0, limit_count = -1, start, end;
    int j, dontsort = 0, vectorlen;
    int getop = 0; /* GET operation counter */
    robj *sortval, *sortby = NULL, *storekey = NULL;
    redisSortObject *vector; /* Resulting vector to sort */

    /* Lookup the key to sort. It must be of the right types */
    sortval = lookupKeyRead(c->db,c->argv[1]);
    if (sortval == NULL) {
        addReply(c,shared.nullmultibulk);
        return;
    }
    if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
        sortval->type != REDIS_ZSET)
    {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Create a list of operations to perform for every sorted element.
     * Only GET operations are created by the parsing code below. */
    operations = listCreate();
    listSetFreeMethod(operations,zfree);
    j = 2;

    /* Now we need to protect sortval incrementing its count, in the future
     * SORT may have options able to overwrite/delete keys during the sorting
     * and the sorted key itself may get destroyed */
    incrRefCount(sortval);

    /* The SORT command has an SQL-alike syntax, parse it */
    while(j < c->argc) {
        /* Number of arguments that follow the current one. */
        int leftargs = c->argc-j-1;
        if (!strcasecmp(c->argv[j]->ptr,"asc")) {
            desc = 0;
        } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
            desc = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
            alpha = 1;
        } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
            limit_start = atoi(c->argv[j+1]->ptr);
            limit_count = atoi(c->argv[j+2]->ptr);
            j+=2;
        } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
            storekey = c->argv[j+1];
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
            sortby = c->argv[j+1];
            /* If the BY pattern does not contain '*', i.e. it is constant,
             * we don't need to sort nor to lookup the weight keys. */
            if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
            j++;
        } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
            listAddNodeTail(operations,createSortOperation(
                REDIS_SORT_GET,c->argv[j+1]));
            getop++;
            j++;
        } else {
            /* Unknown option, or an option missing its arguments: undo
             * what was set up so far and report a syntax error. */
            decrRefCount(sortval);
            listRelease(operations);
            addReply(c,shared.syntaxerr);
            return;
        }
        j++;
    }

    /* Load the sorting vector with all the objects to sort */
    switch(sortval->type) {
    case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
    case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
    case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
    default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
    }
    vector = zmalloc(sizeof(redisSortObject)*vectorlen);
    j = 0;

    if (sortval->type == REDIS_LIST) {
        list *list = sortval->ptr;
        listNode *ln;
        listIter li;

        listRewind(list,&li);
        while((ln = listNext(&li))) {
            robj *ele = ln->value;
            vector[j].obj = ele;
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
    } else {
        dict *set;
        dictIterator *di;
        dictEntry *setele;

        /* For a sorted set, iterate the dictionary of its members. */
        if (sortval->type == REDIS_SET) {
            set = sortval->ptr;
        } else {
            zset *zs = sortval->ptr;
            set = zs->dict;
        }

        di = dictGetIterator(set);
        while((setele = dictNext(di)) != NULL) {
            vector[j].obj = dictGetEntryKey(setele);
            vector[j].u.score = 0;
            vector[j].u.cmpobj = NULL;
            j++;
        }
        dictReleaseIterator(di);
    }
    redisAssert(j == vectorlen);

    /* Now it's time to load the right scores in the sorting vector */
    if (dontsort == 0) {
        for (j = 0; j < vectorlen; j++) {
            if (sortby) {
                robj *byval;

                /* Weight comes from the key named by the BY pattern. A
                 * missing or non-string key leaves the element with the
                 * defaults set above. */
                byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
                if (!byval || byval->type != REDIS_STRING) continue;
                if (alpha) {
                    /* This reference is released in the cleanup loop at
                     * the end of the function. */
                    vector[j].u.cmpobj = getDecodedObject(byval);
                } else {
                    if (byval->encoding == REDIS_ENCODING_RAW) {
                        vector[j].u.score = strtod(byval->ptr,NULL);
                    } else {
                        /* Don't need to decode the object if it's
                         * integer-encoded (the only encoding supported) so
                         * far. We can just cast it */
                        if (byval->encoding == REDIS_ENCODING_INT) {
                            vector[j].u.score = (long)byval->ptr;
                        } else
                            redisAssert(1 != 1);
                    }
                }
            } else {
                /* No BY option: for numeric sorting the score is the
                 * element's own value; for ALPHA nothing is precomputed. */
                if (!alpha) {
                    if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
                        vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
                    else {
                        if (vector[j].obj->encoding == REDIS_ENCODING_INT)
                            vector[j].u.score = (long) vector[j].obj->ptr;
                        else
                            redisAssert(1 != 1);
                    }
                }
            }
        }
    }

    /* We are ready to sort the vector... perform a bit of sanity check
     * on the LIMIT option too. We'll use a partial version of quicksort. */
    start = (limit_start < 0) ? 0 : limit_start;
    end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
    if (start >= vectorlen) {
        /* Start past the end: make the range empty (end == start-1). */
        start = vectorlen-1;
        end = vectorlen-2;
    }
    if (end >= vectorlen) end = vectorlen-1;

    if (dontsort == 0) {
        /* sortCompare() reads its options from the global server struct. */
        server.sort_desc = desc;
        server.sort_alpha = alpha;
        server.sort_bypattern = sortby ? 1 : 0;
        /* pqsort() is used only with BY and a range that does not cover
         * the whole vector; otherwise the full qsort() is run. */
        if (sortby && (start != 0 || end != vectorlen-1))
            pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
        else
            qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
    }

    /* Send command output to the output buffer, performing the specified
     * GET operations if any. With GET options every element in the range
     * yields one output item per GET; otherwise one item per element. */
    outputlen = getop ? getop*(end-start+1) : end-start+1;
    if (storekey == NULL) {
        /* STORE option not specified, send the sorting result to client */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) addReplyBulk(c,vector[j].obj);
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    if (!val || val->type != REDIS_STRING) {
                        addReply(c,shared.nullbulk);
                    } else {
                        addReplyBulk(c,val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
    } else {
        robj *listObject = createListObject();
        list *listPtr = (list*) listObject->ptr;

        /* STORE option specified, set the sorting result as a List object */
        for (j = start; j <= end; j++) {
            listNode *ln;
            listIter li;

            if (!getop) {
                listAddNodeTail(listPtr,vector[j].obj);
                incrRefCount(vector[j].obj);
            }
            listRewind(operations,&li);
            while((ln = listNext(&li))) {
                redisSortOperation *sop = ln->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                    vector[j].obj);

                if (sop->type == REDIS_SORT_GET) {
                    /* A missing or non-string value is stored as an
                     * empty string. */
                    if (!val || val->type != REDIS_STRING) {
                        listAddNodeTail(listPtr,createStringObject("",0));
                    } else {
                        listAddNodeTail(listPtr,val);
                        incrRefCount(val);
                    }
                } else {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                }
            }
        }
        /* The key object gets an extra reference only when dictReplace()
         * returns non-zero. */
        if (dictReplace(c->db->dict,storekey,listObject)) {
            incrRefCount(storekey);
        }
        /* Note: we add 1 because the DB is dirty anyway since even if the
         * SORT result is empty a new key is set and maybe the old content
         * replaced. */
        server.dirty += 1+outputlen;
        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
    }

    /* Cleanup */
    decrRefCount(sortval);
    listRelease(operations);
    for (j = 0; j < vectorlen; j++) {
        if (sortby && alpha && vector[j].u.cmpobj)
            decrRefCount(vector[j].u.cmpobj);
    }
    zfree(vector);
}
6517
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer and must be large enough for the result; no
 * size is passed, so the caller is responsible for that. Values below
 * 1024 are printed as an integer followed by 'B'; larger values are
 * printed with two decimals and a K, M, G or T suffix (powers of 1024).
 *
 * Every value of 'n' produces output: amounts of one terabyte or more are
 * expressed in terabytes rather than leaving 's' unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Catch-all so the buffer is always initialized. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6538
/* Create the string returned by the INFO command. This is decoupled
 * from the INFO command itself as we need to report the same information
 * on memory corruption problems.
 *
 * The result is a newly built sds string made of "name:value\r\n" lines.
 * The caller receives ownership of it (infoCommand() hands it to
 * addReplySds()). */
static sds genRedisInfoString(void) {
    sds info;
    time_t uptime = time(NULL)-server.stat_starttime;
    int j;
    char hmem[64];

    /* Human readable form of the used memory, e.g. "1.50M". */
    bytesToHuman(hmem,zmalloc_used_memory());
    /* General section. The arguments below must stay in the same order
     * as the conversion specifiers of the format string. */
    info = sdscatprintf(sdsempty(),
        "redis_version:%s\r\n"
        "arch_bits:%s\r\n"
        "multiplexing_api:%s\r\n"
        "process_id:%ld\r\n"
        "uptime_in_seconds:%ld\r\n"
        "uptime_in_days:%ld\r\n"
        "connected_clients:%d\r\n"
        "connected_slaves:%d\r\n"
        "blocked_clients:%d\r\n"
        "used_memory:%zu\r\n"
        "used_memory_human:%s\r\n"
        "changes_since_last_save:%lld\r\n"
        "bgsave_in_progress:%d\r\n"
        "last_save_time:%ld\r\n"
        "bgrewriteaof_in_progress:%d\r\n"
        "total_connections_received:%lld\r\n"
        "total_commands_processed:%lld\r\n"
        "expired_keys:%lld\r\n"
        "hash_max_zipmap_entries:%ld\r\n"
        "hash_max_zipmap_value:%ld\r\n"
        "vm_enabled:%d\r\n"
        "role:%s\r\n"
        ,REDIS_VERSION,
        (sizeof(long) == 8) ? "64" : "32",
        aeGetApiName(),
        (long) getpid(),
        uptime,
        uptime/(3600*24),
        /* Slaves are subtracted from the client count. */
        listLength(server.clients)-listLength(server.slaves),
        listLength(server.slaves),
        server.blpop_blocked_clients,
        zmalloc_used_memory(),
        hmem,
        server.dirty,
        server.bgsavechildpid != -1,
        server.lastsave,
        server.bgrewritechildpid != -1,
        server.stat_numconnections,
        server.stat_numcommands,
        server.stat_expiredkeys,
        server.hash_max_zipmap_entries,
        server.hash_max_zipmap_value,
        server.vm_enabled != 0,
        server.masterhost == NULL ? "master" : "slave"
    );
    /* Replication details, only when a master host is configured. */
    if (server.masterhost) {
        info = sdscatprintf(info,
            "master_host:%s\r\n"
            "master_port:%d\r\n"
            "master_link_status:%s\r\n"
            "master_last_io_seconds_ago:%d\r\n"
            ,server.masterhost,
            server.masterport,
            (server.replstate == REDIS_REPL_CONNECTED) ?
                "up" : "down",
            /* -1 when there is no master client object. */
            server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
        );
    }
    /* Virtual memory statistics, collected between lockThreadedIO() and
     * unlockThreadedIO(). */
    if (server.vm_enabled) {
        lockThreadedIO();
        info = sdscatprintf(info,
            "vm_conf_max_memory:%llu\r\n"
            "vm_conf_page_size:%llu\r\n"
            "vm_conf_pages:%llu\r\n"
            "vm_stats_used_pages:%llu\r\n"
            "vm_stats_swapped_objects:%llu\r\n"
            "vm_stats_swappin_count:%llu\r\n"
            "vm_stats_swappout_count:%llu\r\n"
            "vm_stats_io_newjobs_len:%lu\r\n"
            "vm_stats_io_processing_len:%lu\r\n"
            "vm_stats_io_processed_len:%lu\r\n"
            "vm_stats_io_active_threads:%lu\r\n"
            "vm_stats_blocked_clients:%lu\r\n"
            ,(unsigned long long) server.vm_max_memory,
            (unsigned long long) server.vm_page_size,
            (unsigned long long) server.vm_pages,
            (unsigned long long) server.vm_stats_used_pages,
            (unsigned long long) server.vm_stats_swapped_objects,
            (unsigned long long) server.vm_stats_swapins,
            (unsigned long long) server.vm_stats_swapouts,
            (unsigned long) listLength(server.io_newjobs),
            (unsigned long) listLength(server.io_processing),
            (unsigned long) listLength(server.io_processed),
            (unsigned long) server.io_active_threads,
            (unsigned long) server.vm_blocked_clients
        );
        unlockThreadedIO();
    }
    /* One line per database that has at least one key or one expire. */
    for (j = 0; j < server.dbnum; j++) {
        long long keys, vkeys;

        keys = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (keys || vkeys) {
            info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
                j, keys, vkeys);
        }
    }
    return info;
}
6650
6651 static void infoCommand(redisClient *c) {
6652 sds info = genRedisInfoString();
6653 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6654 (unsigned long)sdslen(info)));
6655 addReplySds(c,info);
6656 addReply(c,shared.crlf);
6657 }
6658
6659 static void monitorCommand(redisClient *c) {
6660 /* ignore MONITOR if aleady slave or in monitor mode */
6661 if (c->flags & REDIS_SLAVE) return;
6662
6663 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6664 c->slaveseldb = 0;
6665 listAddNodeTail(server.monitors,c);
6666 addReply(c,shared.ok);
6667 }
6668
6669 /* ================================= Expire ================================= */
6670 static int removeExpire(redisDb *db, robj *key) {
6671 if (dictDelete(db->expires,key) == DICT_OK) {
6672 return 1;
6673 } else {
6674 return 0;
6675 }
6676 }
6677
6678 static int setExpire(redisDb *db, robj *key, time_t when) {
6679 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6680 return 0;
6681 } else {
6682 incrRefCount(key);
6683 return 1;
6684 }
6685 }
6686
6687 /* Return the expire time of the specified key, or -1 if no expire
6688 * is associated with this key (i.e. the key is non volatile) */
6689 static time_t getExpire(redisDb *db, robj *key) {
6690 dictEntry *de;
6691
6692 /* No expire? return ASAP */
6693 if (dictSize(db->expires) == 0 ||
6694 (de = dictFind(db->expires,key)) == NULL) return -1;
6695
6696 return (time_t) dictGetEntryVal(de);
6697 }
6698
6699 static int expireIfNeeded(redisDb *db, robj *key) {
6700 time_t when;
6701 dictEntry *de;
6702
6703 /* No expire? return ASAP */
6704 if (dictSize(db->expires) == 0 ||
6705 (de = dictFind(db->expires,key)) == NULL) return 0;
6706
6707 /* Lookup the expire */
6708 when = (time_t) dictGetEntryVal(de);
6709 if (time(NULL) <= when) return 0;
6710
6711 /* Delete the key */
6712 dictDelete(db->expires,key);
6713 server.stat_expiredkeys++;
6714 return dictDelete(db->dict,key) == DICT_OK;
6715 }
6716
6717 static int deleteIfVolatile(redisDb *db, robj *key) {
6718 dictEntry *de;
6719
6720 /* No expire? return ASAP */
6721 if (dictSize(db->expires) == 0 ||
6722 (de = dictFind(db->expires,key)) == NULL) return 0;
6723
6724 /* Delete the key */
6725 server.dirty++;
6726 server.stat_expiredkeys++;
6727 dictDelete(db->expires,key);
6728 return dictDelete(db->dict,key) == DICT_OK;
6729 }
6730
6731 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6732 dictEntry *de;
6733
6734 de = dictFind(c->db->dict,key);
6735 if (de == NULL) {
6736 addReply(c,shared.czero);
6737 return;
6738 }
6739 if (seconds < 0) {
6740 if (deleteKey(c->db,key)) server.dirty++;
6741 addReply(c, shared.cone);
6742 return;
6743 } else {
6744 time_t when = time(NULL)+seconds;
6745 if (setExpire(c->db,key,when)) {
6746 addReply(c,shared.cone);
6747 server.dirty++;
6748 } else {
6749 addReply(c,shared.czero);
6750 }
6751 return;
6752 }
6753 }
6754
6755 static void expireCommand(redisClient *c) {
6756 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6757 }
6758
6759 static void expireatCommand(redisClient *c) {
6760 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6761 }
6762
6763 static void ttlCommand(redisClient *c) {
6764 time_t expire;
6765 int ttl = -1;
6766
6767 expire = getExpire(c->db,c->argv[1]);
6768 if (expire != -1) {
6769 ttl = (int) (expire-time(NULL));
6770 if (ttl < 0) ttl = -1;
6771 }
6772 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6773 }
6774
6775 /* ================================ MULTI/EXEC ============================== */
6776
6777 /* Client state initialization for MULTI/EXEC */
6778 static void initClientMultiState(redisClient *c) {
6779 c->mstate.commands = NULL;
6780 c->mstate.count = 0;
6781 }
6782
6783 /* Release all the resources associated with MULTI/EXEC state */
6784 static void freeClientMultiState(redisClient *c) {
6785 int j;
6786
6787 for (j = 0; j < c->mstate.count; j++) {
6788 int i;
6789 multiCmd *mc = c->mstate.commands+j;
6790
6791 for (i = 0; i < mc->argc; i++)
6792 decrRefCount(mc->argv[i]);
6793 zfree(mc->argv);
6794 }
6795 zfree(c->mstate.commands);
6796 }
6797
6798 /* Add a new command into the MULTI commands queue */
6799 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6800 multiCmd *mc;
6801 int j;
6802
6803 c->mstate.commands = zrealloc(c->mstate.commands,
6804 sizeof(multiCmd)*(c->mstate.count+1));
6805 mc = c->mstate.commands+c->mstate.count;
6806 mc->cmd = cmd;
6807 mc->argc = c->argc;
6808 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6809 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6810 for (j = 0; j < c->argc; j++)
6811 incrRefCount(mc->argv[j]);
6812 c->mstate.count++;
6813 }
6814
6815 static void multiCommand(redisClient *c) {
6816 c->flags |= REDIS_MULTI;
6817 addReply(c,shared.ok);
6818 }
6819
6820 static void discardCommand(redisClient *c) {
6821 if (!(c->flags & REDIS_MULTI)) {
6822 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6823 return;
6824 }
6825
6826 freeClientMultiState(c);
6827 initClientMultiState(c);
6828 c->flags &= (~REDIS_MULTI);
6829 addReply(c,shared.ok);
6830 }
6831
6832 static void execCommand(redisClient *c) {
6833 int j;
6834 robj **orig_argv;
6835 int orig_argc;
6836
6837 if (!(c->flags & REDIS_MULTI)) {
6838 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6839 return;
6840 }
6841
6842 orig_argv = c->argv;
6843 orig_argc = c->argc;
6844 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6845 for (j = 0; j < c->mstate.count; j++) {
6846 c->argc = c->mstate.commands[j].argc;
6847 c->argv = c->mstate.commands[j].argv;
6848 call(c,c->mstate.commands[j].cmd);
6849 }
6850 c->argv = orig_argv;
6851 c->argc = orig_argc;
6852 freeClientMultiState(c);
6853 initClientMultiState(c);
6854 c->flags &= (~REDIS_MULTI);
6855 }
6856
6857 /* =========================== Blocking Operations ========================= */
6858
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6887
6888 /* Set a client in blocking mode for the specified key, with the specified
6889 * timeout */
6890 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6891 dictEntry *de;
6892 list *l;
6893 int j;
6894
6895 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6896 c->blockingkeysnum = numkeys;
6897 c->blockingto = timeout;
6898 for (j = 0; j < numkeys; j++) {
6899 /* Add the key in the client structure, to map clients -> keys */
6900 c->blockingkeys[j] = keys[j];
6901 incrRefCount(keys[j]);
6902
6903 /* And in the other "side", to map keys -> clients */
6904 de = dictFind(c->db->blockingkeys,keys[j]);
6905 if (de == NULL) {
6906 int retval;
6907
6908 /* For every key we take a list of clients blocked for it */
6909 l = listCreate();
6910 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6911 incrRefCount(keys[j]);
6912 assert(retval == DICT_OK);
6913 } else {
6914 l = dictGetEntryVal(de);
6915 }
6916 listAddNodeTail(l,c);
6917 }
6918 /* Mark the client as a blocked client */
6919 c->flags |= REDIS_BLOCKED;
6920 server.blpop_blocked_clients++;
6921 }
6922
6923 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6924 static void unblockClientWaitingData(redisClient *c) {
6925 dictEntry *de;
6926 list *l;
6927 int j;
6928
6929 assert(c->blockingkeys != NULL);
6930 /* The client may wait for multiple keys, so unblock it for every key. */
6931 for (j = 0; j < c->blockingkeysnum; j++) {
6932 /* Remove this client from the list of clients waiting for this key. */
6933 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6934 assert(de != NULL);
6935 l = dictGetEntryVal(de);
6936 listDelNode(l,listSearchKey(l,c));
6937 /* If the list is empty we need to remove it to avoid wasting memory */
6938 if (listLength(l) == 0)
6939 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6940 decrRefCount(c->blockingkeys[j]);
6941 }
6942 /* Cleanup the client structure */
6943 zfree(c->blockingkeys);
6944 c->blockingkeys = NULL;
6945 c->flags &= (~REDIS_BLOCKED);
6946 server.blpop_blocked_clients--;
6947 /* We want to process data if there is some command waiting
6948 * in the input buffer. Note that this is safe even if
6949 * unblockClientWaitingData() gets called from freeClient() because
6950 * freeClient() will be smart enough to call this function
6951 * *after* c->querybuf was set to NULL. */
6952 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6953 }
6954
6955 /* This should be called from any function PUSHing into lists.
6956 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6957 * 'ele' is the element pushed.
6958 *
6959 * If the function returns 0 there was no client waiting for a list push
6960 * against this key.
6961 *
6962 * If the function returns 1 there was a client waiting for a list push
6963 * against this key, the element was passed to this client thus it's not
6964 * needed to actually add it to the list and the caller should return asap. */
6965 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6966 struct dictEntry *de;
6967 redisClient *receiver;
6968 list *l;
6969 listNode *ln;
6970
6971 de = dictFind(c->db->blockingkeys,key);
6972 if (de == NULL) return 0;
6973 l = dictGetEntryVal(de);
6974 ln = listFirst(l);
6975 assert(ln != NULL);
6976 receiver = ln->value;
6977
6978 addReplySds(receiver,sdsnew("*2\r\n"));
6979 addReplyBulk(receiver,key);
6980 addReplyBulk(receiver,ele);
6981 unblockClientWaitingData(receiver);
6982 return 1;
6983 }
6984
6985 /* Blocking RPOP/LPOP */
6986 static void blockingPopGenericCommand(redisClient *c, int where) {
6987 robj *o;
6988 time_t timeout;
6989 int j;
6990
6991 for (j = 1; j < c->argc-1; j++) {
6992 o = lookupKeyWrite(c->db,c->argv[j]);
6993 if (o != NULL) {
6994 if (o->type != REDIS_LIST) {
6995 addReply(c,shared.wrongtypeerr);
6996 return;
6997 } else {
6998 list *list = o->ptr;
6999 if (listLength(list) != 0) {
7000 /* If the list contains elements fall back to the usual
7001 * non-blocking POP operation */
7002 robj *argv[2], **orig_argv;
7003 int orig_argc;
7004
7005 /* We need to alter the command arguments before to call
7006 * popGenericCommand() as the command takes a single key. */
7007 orig_argv = c->argv;
7008 orig_argc = c->argc;
7009 argv[1] = c->argv[j];
7010 c->argv = argv;
7011 c->argc = 2;
7012
7013 /* Also the return value is different, we need to output
7014 * the multi bulk reply header and the key name. The
7015 * "real" command will add the last element (the value)
7016 * for us. If this souds like an hack to you it's just
7017 * because it is... */
7018 addReplySds(c,sdsnew("*2\r\n"));
7019 addReplyBulk(c,argv[1]);
7020 popGenericCommand(c,where);
7021
7022 /* Fix the client structure with the original stuff */
7023 c->argv = orig_argv;
7024 c->argc = orig_argc;
7025 return;
7026 }
7027 }
7028 }
7029 }
7030 /* If the list is empty or the key does not exists we must block */
7031 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7032 if (timeout > 0) timeout += time(NULL);
7033 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7034 }
7035
7036 static void blpopCommand(redisClient *c) {
7037 blockingPopGenericCommand(c,REDIS_HEAD);
7038 }
7039
7040 static void brpopCommand(redisClient *c) {
7041 blockingPopGenericCommand(c,REDIS_TAIL);
7042 }
7043
7044 /* =============================== Replication ============================= */
7045
7046 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7047 ssize_t nwritten, ret = size;
7048 time_t start = time(NULL);
7049
7050 timeout++;
7051 while(size) {
7052 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7053 nwritten = write(fd,ptr,size);
7054 if (nwritten == -1) return -1;
7055 ptr += nwritten;
7056 size -= nwritten;
7057 }
7058 if ((time(NULL)-start) > timeout) {
7059 errno = ETIMEDOUT;
7060 return -1;
7061 }
7062 }
7063 return ret;
7064 }
7065
7066 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7067 ssize_t nread, totread = 0;
7068 time_t start = time(NULL);
7069
7070 timeout++;
7071 while(size) {
7072 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7073 nread = read(fd,ptr,size);
7074 if (nread == -1) return -1;
7075 ptr += nread;
7076 size -= nread;
7077 totread += nread;
7078 }
7079 if ((time(NULL)-start) > timeout) {
7080 errno = ETIMEDOUT;
7081 return -1;
7082 }
7083 }
7084 return totread;
7085 }
7086
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead().
 *
 * 'size' is the size of the destination buffer: at most size-1 bytes are
 * stored and the buffer is always null terminated. Reading stops at the
 * first '\n', which is not stored; a '\r' right before it is replaced by
 * the terminator. 'timeout' is passed to syncRead() for every byte.
 *
 * Returns the number of bytes stored before the '\n' was found (a stripped
 * '\r' is still counted), or before the buffer filled up. Returns -1 if
 * syncRead() fails, or if 'size' is not positive (errno set to EINVAL). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve one byte for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of room consumed: without this the loop never stops
         * on a line longer than the buffer and writes past its end. */
        size--;
    }
    return nread;
}
7107
7108 static void syncCommand(redisClient *c) {
7109 /* ignore SYNC if aleady slave or in monitor mode */
7110 if (c->flags & REDIS_SLAVE) return;
7111
7112 /* SYNC can't be issued when the server has pending data to send to
7113 * the client about already issued commands. We need a fresh reply
7114 * buffer registering the differences between the BGSAVE and the current
7115 * dataset, so that we can copy to other slaves if needed. */
7116 if (listLength(c->reply) != 0) {
7117 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7118 return;
7119 }
7120
7121 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7122 /* Here we need to check if there is a background saving operation
7123 * in progress, or if it is required to start one */
7124 if (server.bgsavechildpid != -1) {
7125 /* Ok a background save is in progress. Let's check if it is a good
7126 * one for replication, i.e. if there is another slave that is
7127 * registering differences since the server forked to save */
7128 redisClient *slave;
7129 listNode *ln;
7130 listIter li;
7131
7132 listRewind(server.slaves,&li);
7133 while((ln = listNext(&li))) {
7134 slave = ln->value;
7135 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7136 }
7137 if (ln) {
7138 /* Perfect, the server is already registering differences for
7139 * another slave. Set the right state, and copy the buffer. */
7140 listRelease(c->reply);
7141 c->reply = listDup(slave->reply);
7142 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7143 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7144 } else {
7145 /* No way, we need to wait for the next BGSAVE in order to
7146 * register differences */
7147 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7148 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7149 }
7150 } else {
7151 /* Ok we don't have a BGSAVE in progress, let's start one */
7152 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7153 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7154 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7155 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7156 return;
7157 }
7158 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7159 }
7160 c->repldbfd = -1;
7161 c->flags |= REDIS_SLAVE;
7162 c->slaveseldb = 0;
7163 listAddNodeTail(server.slaves,c);
7164 return;
7165 }
7166
7167 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7168 redisClient *slave = privdata;
7169 REDIS_NOTUSED(el);
7170 REDIS_NOTUSED(mask);
7171 char buf[REDIS_IOBUF_LEN];
7172 ssize_t nwritten, buflen;
7173
7174 if (slave->repldboff == 0) {
7175 /* Write the bulk write count before to transfer the DB. In theory here
7176 * we don't know how much room there is in the output buffer of the
7177 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7178 * operations) will never be smaller than the few bytes we need. */
7179 sds bulkcount;
7180
7181 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7182 slave->repldbsize);
7183 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7184 {
7185 sdsfree(bulkcount);
7186 freeClient(slave);
7187 return;
7188 }
7189 sdsfree(bulkcount);
7190 }
7191 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7192 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7193 if (buflen <= 0) {
7194 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7195 (buflen == 0) ? "premature EOF" : strerror(errno));
7196 freeClient(slave);
7197 return;
7198 }
7199 if ((nwritten = write(fd,buf,buflen)) == -1) {
7200 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7201 strerror(errno));
7202 freeClient(slave);
7203 return;
7204 }
7205 slave->repldboff += nwritten;
7206 if (slave->repldboff == slave->repldbsize) {
7207 close(slave->repldbfd);
7208 slave->repldbfd = -1;
7209 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7210 slave->replstate = REDIS_REPL_ONLINE;
7211 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7212 sendReplyToClient, slave) == AE_ERR) {
7213 freeClient(slave);
7214 return;
7215 }
7216 addReplySds(slave,sdsempty());
7217 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7218 }
7219 }
7220
7221 /* This function is called at the end of every backgrond saving.
7222 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7223 * otherwise REDIS_ERR is passed to the function.
7224 *
7225 * The goal of this function is to handle slaves waiting for a successful
7226 * background saving in order to perform non-blocking synchronization. */
7227 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7228 listNode *ln;
7229 int startbgsave = 0;
7230 listIter li;
7231
7232 listRewind(server.slaves,&li);
7233 while((ln = listNext(&li))) {
7234 redisClient *slave = ln->value;
7235
7236 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7237 startbgsave = 1;
7238 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7239 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7240 struct redis_stat buf;
7241
7242 if (bgsaveerr != REDIS_OK) {
7243 freeClient(slave);
7244 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7245 continue;
7246 }
7247 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7248 redis_fstat(slave->repldbfd,&buf) == -1) {
7249 freeClient(slave);
7250 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7251 continue;
7252 }
7253 slave->repldboff = 0;
7254 slave->repldbsize = buf.st_size;
7255 slave->replstate = REDIS_REPL_SEND_BULK;
7256 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7257 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7258 freeClient(slave);
7259 continue;
7260 }
7261 }
7262 }
7263 if (startbgsave) {
7264 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7265 listIter li;
7266
7267 listRewind(server.slaves,&li);
7268 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7269 while((ln = listNext(&li))) {
7270 redisClient *slave = ln->value;
7271
7272 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7273 freeClient(slave);
7274 }
7275 }
7276 }
7277 }
7278
7279 static int syncWithMaster(void) {
7280 char buf[1024], tmpfile[256], authcmd[1024];
7281 long dumpsize;
7282 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7283 int dfd, maxtries = 5;
7284
7285 if (fd == -1) {
7286 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7287 strerror(errno));
7288 return REDIS_ERR;
7289 }
7290
7291 /* AUTH with the master if required. */
7292 if(server.masterauth) {
7293 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7294 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7295 close(fd);
7296 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7297 strerror(errno));
7298 return REDIS_ERR;
7299 }
7300 /* Read the AUTH result. */
7301 if (syncReadLine(fd,buf,1024,3600) == -1) {
7302 close(fd);
7303 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7304 strerror(errno));
7305 return REDIS_ERR;
7306 }
7307 if (buf[0] != '+') {
7308 close(fd);
7309 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7310 return REDIS_ERR;
7311 }
7312 }
7313
7314 /* Issue the SYNC command */
7315 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7316 close(fd);
7317 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7318 strerror(errno));
7319 return REDIS_ERR;
7320 }
7321 /* Read the bulk write count */
7322 if (syncReadLine(fd,buf,1024,3600) == -1) {
7323 close(fd);
7324 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7325 strerror(errno));
7326 return REDIS_ERR;
7327 }
7328 if (buf[0] != '$') {
7329 close(fd);
7330 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7331 return REDIS_ERR;
7332 }
7333 dumpsize = strtol(buf+1,NULL,10);
7334 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7335 /* Read the bulk write data on a temp file */
7336 while(maxtries--) {
7337 snprintf(tmpfile,256,
7338 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7339 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7340 if (dfd != -1) break;
7341 sleep(1);
7342 }
7343 if (dfd == -1) {
7344 close(fd);
7345 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7346 return REDIS_ERR;
7347 }
7348 while(dumpsize) {
7349 int nread, nwritten;
7350
7351 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7352 if (nread == -1) {
7353 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7354 strerror(errno));
7355 close(fd);
7356 close(dfd);
7357 return REDIS_ERR;
7358 }
7359 nwritten = write(dfd,buf,nread);
7360 if (nwritten == -1) {
7361 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7362 close(fd);
7363 close(dfd);
7364 return REDIS_ERR;
7365 }
7366 dumpsize -= nread;
7367 }
7368 close(dfd);
7369 if (rename(tmpfile,server.dbfilename) == -1) {
7370 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7371 unlink(tmpfile);
7372 close(fd);
7373 return REDIS_ERR;
7374 }
7375 emptyDb();
7376 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7377 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7378 close(fd);
7379 return REDIS_ERR;
7380 }
7381 server.master = createClient(fd);
7382 server.master->flags |= REDIS_MASTER;
7383 server.master->authenticated = 1;
7384 server.replstate = REDIS_REPL_CONNECTED;
7385 return REDIS_OK;
7386 }
7387
7388 static void slaveofCommand(redisClient *c) {
7389 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7390 !strcasecmp(c->argv[2]->ptr,"one")) {
7391 if (server.masterhost) {
7392 sdsfree(server.masterhost);
7393 server.masterhost = NULL;
7394 if (server.master) freeClient(server.master);
7395 server.replstate = REDIS_REPL_NONE;
7396 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7397 }
7398 } else {
7399 sdsfree(server.masterhost);
7400 server.masterhost = sdsdup(c->argv[1]->ptr);
7401 server.masterport = atoi(c->argv[2]->ptr);
7402 if (server.master) freeClient(server.master);
7403 server.replstate = REDIS_REPL_CONNECT;
7404 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7405 server.masterhost, server.masterport);
7406 }
7407 addReply(c,shared.ok);
7408 }
7409
7410 /* ============================ Maxmemory directive ======================== */
7411
7412 /* Try to free one object form the pre-allocated objects free list.
7413 * This is useful under low mem conditions as by default we take 1 million
7414 * free objects allocated. On success REDIS_OK is returned, otherwise
7415 * REDIS_ERR. */
7416 static int tryFreeOneObjectFromFreelist(void) {
7417 robj *o;
7418
7419 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7420 if (listLength(server.objfreelist)) {
7421 listNode *head = listFirst(server.objfreelist);
7422 o = listNodeValue(head);
7423 listDelNode(server.objfreelist,head);
7424 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7425 zfree(o);
7426 return REDIS_OK;
7427 } else {
7428 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7429 return REDIS_ERR;
7430 }
7431 }
7432
7433 /* This function gets called when 'maxmemory' is set on the config file to limit
7434 * the max memory used by the server, and we are out of memory.
7435 * This function will try to, in order:
7436 *
7437 * - Free objects from the free list
7438 * - Try to remove keys with an EXPIRE set
7439 *
7440 * It is not possible to free enough memory to reach used-memory < maxmemory
7441 * the server will start refusing commands that will enlarge even more the
7442 * memory usage.
7443 */
7444 static void freeMemoryIfNeeded(void) {
7445 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7446 int j, k, freed = 0;
7447
7448 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7449 for (j = 0; j < server.dbnum; j++) {
7450 int minttl = -1;
7451 robj *minkey = NULL;
7452 struct dictEntry *de;
7453
7454 if (dictSize(server.db[j].expires)) {
7455 freed = 1;
7456 /* From a sample of three keys drop the one nearest to
7457 * the natural expire */
7458 for (k = 0; k < 3; k++) {
7459 time_t t;
7460
7461 de = dictGetRandomKey(server.db[j].expires);
7462 t = (time_t) dictGetEntryVal(de);
7463 if (minttl == -1 || t < minttl) {
7464 minkey = dictGetEntryKey(de);
7465 minttl = t;
7466 }
7467 }
7468 deleteKey(server.db+j,minkey);
7469 }
7470 }
7471 if (!freed) return; /* nothing to free... */
7472 }
7473 }
7474
7475 /* ============================== Append Only file ========================== */
7476
7477 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7478 sds buf = sdsempty();
7479 int j;
7480 ssize_t nwritten;
7481 time_t now;
7482 robj *tmpargv[3];
7483
7484 /* The DB this command was targetting is not the same as the last command
7485 * we appendend. To issue a SELECT command is needed. */
7486 if (dictid != server.appendseldb) {
7487 char seldb[64];
7488
7489 snprintf(seldb,sizeof(seldb),"%d",dictid);
7490 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7491 (unsigned long)strlen(seldb),seldb);
7492 server.appendseldb = dictid;
7493 }
7494
7495 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7496 * EXPIREs into EXPIREATs calls */
7497 if (cmd->proc == expireCommand) {
7498 long when;
7499
7500 tmpargv[0] = createStringObject("EXPIREAT",8);
7501 tmpargv[1] = argv[1];
7502 incrRefCount(argv[1]);
7503 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7504 tmpargv[2] = createObject(REDIS_STRING,
7505 sdscatprintf(sdsempty(),"%ld",when));
7506 argv = tmpargv;
7507 }
7508
7509 /* Append the actual command */
7510 buf = sdscatprintf(buf,"*%d\r\n",argc);
7511 for (j = 0; j < argc; j++) {
7512 robj *o = argv[j];
7513
7514 o = getDecodedObject(o);
7515 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7516 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7517 buf = sdscatlen(buf,"\r\n",2);
7518 decrRefCount(o);
7519 }
7520
7521 /* Free the objects from the modified argv for EXPIREAT */
7522 if (cmd->proc == expireCommand) {
7523 for (j = 0; j < 3; j++)
7524 decrRefCount(argv[j]);
7525 }
7526
7527 /* We want to perform a single write. This should be guaranteed atomic
7528 * at least if the filesystem we are writing is a real physical one.
7529 * While this will save us against the server being killed I don't think
7530 * there is much to do about the whole server stopping for power problems
7531 * or alike */
7532 nwritten = write(server.appendfd,buf,sdslen(buf));
7533 if (nwritten != (signed)sdslen(buf)) {
7534 /* Ooops, we are in troubles. The best thing to do for now is
7535 * to simply exit instead to give the illusion that everything is
7536 * working as expected. */
7537 if (nwritten == -1) {
7538 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7539 } else {
7540 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7541 }
7542 exit(1);
7543 }
7544 /* If a background append only file rewriting is in progress we want to
7545 * accumulate the differences between the child DB and the current one
7546 * in a buffer, so that when the child process will do its work we
7547 * can append the differences to the new append only file. */
7548 if (server.bgrewritechildpid != -1)
7549 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7550
7551 sdsfree(buf);
7552 now = time(NULL);
7553 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7554 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7555 now-server.lastfsync > 1))
7556 {
7557 fsync(server.appendfd); /* Let's try to get this data on the disk */
7558 server.lastfsync = now;
7559 }
7560 }
7561
7562 /* In Redis commands are always executed in the context of a client, so in
7563 * order to load the append only file we need to create a fake client. */
7564 static struct redisClient *createFakeClient(void) {
7565 struct redisClient *c = zmalloc(sizeof(*c));
7566
7567 selectDb(c,0);
7568 c->fd = -1;
7569 c->querybuf = sdsempty();
7570 c->argc = 0;
7571 c->argv = NULL;
7572 c->flags = 0;
7573 /* We set the fake client as a slave waiting for the synchronization
7574 * so that Redis will not try to send replies to this client. */
7575 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7576 c->reply = listCreate();
7577 listSetFreeMethod(c->reply,decrRefCount);
7578 listSetDupMethod(c->reply,dupClientReplyValue);
7579 return c;
7580 }
7581
7582 static void freeFakeClient(struct redisClient *c) {
7583 sdsfree(c->querybuf);
7584 listRelease(c->reply);
7585 zfree(c);
7586 }
7587
7588 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7589 * error (the append only file is zero-length) REDIS_ERR is returned. On
7590 * fatal error an error message is logged and the program exists. */
7591 int loadAppendOnlyFile(char *filename) {
7592 struct redisClient *fakeClient;
7593 FILE *fp = fopen(filename,"r");
7594 struct redis_stat sb;
7595 unsigned long long loadedkeys = 0;
7596
7597 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7598 return REDIS_ERR;
7599
7600 if (fp == NULL) {
7601 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7602 exit(1);
7603 }
7604
7605 fakeClient = createFakeClient();
7606 while(1) {
7607 int argc, j;
7608 unsigned long len;
7609 robj **argv;
7610 char buf[128];
7611 sds argsds;
7612 struct redisCommand *cmd;
7613
7614 if (fgets(buf,sizeof(buf),fp) == NULL) {
7615 if (feof(fp))
7616 break;
7617 else
7618 goto readerr;
7619 }
7620 if (buf[0] != '*') goto fmterr;
7621 argc = atoi(buf+1);
7622 argv = zmalloc(sizeof(robj*)*argc);
7623 for (j = 0; j < argc; j++) {
7624 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7625 if (buf[0] != '$') goto fmterr;
7626 len = strtol(buf+1,NULL,10);
7627 argsds = sdsnewlen(NULL,len);
7628 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7629 argv[j] = createObject(REDIS_STRING,argsds);
7630 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7631 }
7632
7633 /* Command lookup */
7634 cmd = lookupCommand(argv[0]->ptr);
7635 if (!cmd) {
7636 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7637 exit(1);
7638 }
7639 /* Try object sharing and encoding */
7640 if (server.shareobjects) {
7641 int j;
7642 for(j = 1; j < argc; j++)
7643 argv[j] = tryObjectSharing(argv[j]);
7644 }
7645 if (cmd->flags & REDIS_CMD_BULK)
7646 tryObjectEncoding(argv[argc-1]);
7647 /* Run the command in the context of a fake client */
7648 fakeClient->argc = argc;
7649 fakeClient->argv = argv;
7650 cmd->proc(fakeClient);
7651 /* Discard the reply objects list from the fake client */
7652 while(listLength(fakeClient->reply))
7653 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7654 /* Clean up, ready for the next command */
7655 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7656 zfree(argv);
7657 /* Handle swapping while loading big datasets when VM is on */
7658 loadedkeys++;
7659 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7660 while (zmalloc_used_memory() > server.vm_max_memory) {
7661 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7662 }
7663 }
7664 }
7665 fclose(fp);
7666 freeFakeClient(fakeClient);
7667 return REDIS_OK;
7668
7669 readerr:
7670 if (feof(fp)) {
7671 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7672 } else {
7673 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7674 }
7675 exit(1);
7676 fmterr:
7677 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7678 exit(1);
7679 }
7680
7681 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7682 static int fwriteBulkObject(FILE *fp, robj *obj) {
7683 char buf[128];
7684 int decrrc = 0;
7685
7686 /* Avoid the incr/decr ref count business if possible to help
7687 * copy-on-write (we are often in a child process when this function
7688 * is called).
7689 * Also makes sure that key objects don't get incrRefCount-ed when VM
7690 * is enabled */
7691 if (obj->encoding != REDIS_ENCODING_RAW) {
7692 obj = getDecodedObject(obj);
7693 decrrc = 1;
7694 }
7695 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7696 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7697 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7698 goto err;
7699 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7700 if (decrrc) decrRefCount(obj);
7701 return 1;
7702 err:
7703 if (decrrc) decrRefCount(obj);
7704 return 0;
7705 }
7706
/* Write a binary-safe string into a file in the bulk format:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload that may contain any byte value,
 * including NUL. An empty payload (len == 0) is valid and produces
 * "$0\r\n\r\n".
 *
 * Returns 1 on success, 0 as soon as one fwrite() fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: print it with %lu so that the conversion
     * specifier matches the argument type. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even on success, so the payload
     * write is skipped for empty strings. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7718
/* Write the double 'd' into 'fp' as a bulk item:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * The payload is the "%.17g" representation of the number. Returns 1 on
 * success, 0 if one of the two fwrite() calls fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already carries its trailing CRLF, so the count
     * announced in the header is its length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
7729
/* Write the long 'l' into 'fp' as a bulk item:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * The payload is the decimal representation of the number. Returns 1 on
 * success, 0 if one of the two fwrite() calls fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already carries its trailing CRLF, so the count
     * announced in the header is its length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
7740
7741 /* Write a sequence of commands able to fully rebuild the dataset into
7742 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7743 static int rewriteAppendOnlyFile(char *filename) {
7744 dictIterator *di = NULL;
7745 dictEntry *de;
7746 FILE *fp;
7747 char tmpfile[256];
7748 int j;
7749 time_t now = time(NULL);
7750
7751 /* Note that we have to use a different temp name here compared to the
7752 * one used by rewriteAppendOnlyFileBackground() function. */
7753 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7754 fp = fopen(tmpfile,"w");
7755 if (!fp) {
7756 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7757 return REDIS_ERR;
7758 }
7759 for (j = 0; j < server.dbnum; j++) {
7760 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7761 redisDb *db = server.db+j;
7762 dict *d = db->dict;
7763 if (dictSize(d) == 0) continue;
7764 di = dictGetIterator(d);
7765 if (!di) {
7766 fclose(fp);
7767 return REDIS_ERR;
7768 }
7769
7770 /* SELECT the new DB */
7771 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7772 if (fwriteBulkLong(fp,j) == 0) goto werr;
7773
7774 /* Iterate this DB writing every entry */
7775 while((de = dictNext(di)) != NULL) {
7776 robj *key, *o;
7777 time_t expiretime;
7778 int swapped;
7779
7780 key = dictGetEntryKey(de);
7781 /* If the value for this key is swapped, load a preview in memory.
7782 * We use a "swapped" flag to remember if we need to free the
7783 * value object instead to just increment the ref count anyway
7784 * in order to avoid copy-on-write of pages if we are forked() */
7785 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7786 key->storage == REDIS_VM_SWAPPING) {
7787 o = dictGetEntryVal(de);
7788 swapped = 0;
7789 } else {
7790 o = vmPreviewObject(key);
7791 swapped = 1;
7792 }
7793 expiretime = getExpire(db,key);
7794
7795 /* Save the key and associated value */
7796 if (o->type == REDIS_STRING) {
7797 /* Emit a SET command */
7798 char cmd[]="*3\r\n$3\r\nSET\r\n";
7799 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7800 /* Key and value */
7801 if (fwriteBulkObject(fp,key) == 0) goto werr;
7802 if (fwriteBulkObject(fp,o) == 0) goto werr;
7803 } else if (o->type == REDIS_LIST) {
7804 /* Emit the RPUSHes needed to rebuild the list */
7805 list *list = o->ptr;
7806 listNode *ln;
7807 listIter li;
7808
7809 listRewind(list,&li);
7810 while((ln = listNext(&li))) {
7811 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7812 robj *eleobj = listNodeValue(ln);
7813
7814 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7815 if (fwriteBulkObject(fp,key) == 0) goto werr;
7816 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7817 }
7818 } else if (o->type == REDIS_SET) {
7819 /* Emit the SADDs needed to rebuild the set */
7820 dict *set = o->ptr;
7821 dictIterator *di = dictGetIterator(set);
7822 dictEntry *de;
7823
7824 while((de = dictNext(di)) != NULL) {
7825 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7826 robj *eleobj = dictGetEntryKey(de);
7827
7828 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7829 if (fwriteBulkObject(fp,key) == 0) goto werr;
7830 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7831 }
7832 dictReleaseIterator(di);
7833 } else if (o->type == REDIS_ZSET) {
7834 /* Emit the ZADDs needed to rebuild the sorted set */
7835 zset *zs = o->ptr;
7836 dictIterator *di = dictGetIterator(zs->dict);
7837 dictEntry *de;
7838
7839 while((de = dictNext(di)) != NULL) {
7840 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7841 robj *eleobj = dictGetEntryKey(de);
7842 double *score = dictGetEntryVal(de);
7843
7844 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7845 if (fwriteBulkObject(fp,key) == 0) goto werr;
7846 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7847 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7848 }
7849 dictReleaseIterator(di);
7850 } else if (o->type == REDIS_HASH) {
7851 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7852
7853 /* Emit the HSETs needed to rebuild the hash */
7854 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7855 unsigned char *p = zipmapRewind(o->ptr);
7856 unsigned char *field, *val;
7857 unsigned int flen, vlen;
7858
7859 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7860 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7861 if (fwriteBulkObject(fp,key) == 0) goto werr;
7862 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7863 return -1;
7864 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7865 return -1;
7866 }
7867 } else {
7868 dictIterator *di = dictGetIterator(o->ptr);
7869 dictEntry *de;
7870
7871 while((de = dictNext(di)) != NULL) {
7872 robj *field = dictGetEntryKey(de);
7873 robj *val = dictGetEntryVal(de);
7874
7875 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7876 if (fwriteBulkObject(fp,key) == 0) goto werr;
7877 if (fwriteBulkObject(fp,field) == -1) return -1;
7878 if (fwriteBulkObject(fp,val) == -1) return -1;
7879 }
7880 dictReleaseIterator(di);
7881 }
7882 } else {
7883 redisAssert(0);
7884 }
7885 /* Save the expire time */
7886 if (expiretime != -1) {
7887 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7888 /* If this key is already expired skip it */
7889 if (expiretime < now) continue;
7890 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7891 if (fwriteBulkObject(fp,key) == 0) goto werr;
7892 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7893 }
7894 if (swapped) decrRefCount(o);
7895 }
7896 dictReleaseIterator(di);
7897 }
7898
7899 /* Make sure data will not remain on the OS's output buffers */
7900 fflush(fp);
7901 fsync(fileno(fp));
7902 fclose(fp);
7903
7904 /* Use RENAME to make sure the DB file is changed atomically only
7905 * if the generate DB file is ok. */
7906 if (rename(tmpfile,filename) == -1) {
7907 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7908 unlink(tmpfile);
7909 return REDIS_ERR;
7910 }
7911 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7912 return REDIS_OK;
7913
7914 werr:
7915 fclose(fp);
7916 unlink(tmpfile);
7917 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7918 if (di) dictReleaseIterator(di);
7919 return REDIS_ERR;
7920 }
7921
7922 /* This is how rewriting of the append only file in background works:
7923 *
7924 * 1) The user calls BGREWRITEAOF
7925 * 2) Redis calls this function, that forks():
7926 * 2a) the child rewrite the append only file in a temp file.
7927 * 2b) the parent accumulates differences in server.bgrewritebuf.
7928 * 3) When the child finished '2a' exists.
7929 * 4) The parent will trap the exit code, if it's OK, will append the
7930 * data accumulated into server.bgrewritebuf into the temp file, and
7931 * finally will rename(2) the temp file in the actual file name.
7932 * The the new file is reopened as the new append only file. Profit!
7933 */
7934 static int rewriteAppendOnlyFileBackground(void) {
7935 pid_t childpid;
7936
7937 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7938 if (server.vm_enabled) waitEmptyIOJobsQueue();
7939 if ((childpid = fork()) == 0) {
7940 /* Child */
7941 char tmpfile[256];
7942
7943 if (server.vm_enabled) vmReopenSwapFile();
7944 close(server.fd);
7945 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7946 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7947 _exit(0);
7948 } else {
7949 _exit(1);
7950 }
7951 } else {
7952 /* Parent */
7953 if (childpid == -1) {
7954 redisLog(REDIS_WARNING,
7955 "Can't rewrite append only file in background: fork: %s",
7956 strerror(errno));
7957 return REDIS_ERR;
7958 }
7959 redisLog(REDIS_NOTICE,
7960 "Background append only file rewriting started by pid %d",childpid);
7961 server.bgrewritechildpid = childpid;
7962 /* We set appendseldb to -1 in order to force the next call to the
7963 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7964 * accumulated by the parent into server.bgrewritebuf will start
7965 * with a SELECT statement and it will be safe to merge. */
7966 server.appendseldb = -1;
7967 return REDIS_OK;
7968 }
7969 return REDIS_OK; /* unreached */
7970 }
7971
7972 static void bgrewriteaofCommand(redisClient *c) {
7973 if (server.bgrewritechildpid != -1) {
7974 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7975 return;
7976 }
7977 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7978 char *status = "+Background append only file rewriting started\r\n";
7979 addReplySds(c,sdsnew(status));
7980 } else {
7981 addReply(c,shared.err);
7982 }
7983 }
7984
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof", that is the name
 * used by the background rewrite child having pid 'childpid'. The result
 * of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7991
7992 /* Virtual Memory is composed mainly of two subsystems:
7993 * - Blocking Virtual Memory
7994 * - Threaded Virtual Memory I/O
7995 * The two parts are not fully decoupled, but functions are split among two
7996 * different sections of the source code (delimited by comments) in order to
7997 * make more clear what functionality is about the blocking VM and what about
7998 * the threaded (not blocking) VM.
7999 *
8000 * Redis VM design:
8001 *
8002 * Redis VM is a blocking VM (one that blocks reading swapped values from
8003 * disk into memory when a value swapped out is needed in memory) that is made
8004 * unblocking by trying to examine the command argument vector in order to
8005 * load in background values that will likely be needed in order to exec
8006 * the command. The command is executed only once all the relevant keys
8007 * are loaded into memory.
8008 *
8009 * This is basically almost as simple as a blocking VM, but almost as parallel
8010 * as a fully non-blocking VM.
8011 */
8012
8013 /* =================== Virtual Memory - Blocking Side ====================== */
8014
8015 /* substitute the first occurrence of '%p' with the process pid in the
8016 * swap file name. */
8017 static void expandVmSwapFilename(void) {
8018 char *p = strstr(server.vm_swap_file,"%p");
8019 sds new;
8020
8021 if (!p) return;
8022 new = sdsempty();
8023 *p = '\0';
8024 new = sdscat(new,server.vm_swap_file);
8025 new = sdscatprintf(new,"%ld",(long) getpid());
8026 new = sdscat(new,p+2);
8027 zfree(server.vm_swap_file);
8028 server.vm_swap_file = new;
8029 }
8030
8031 static void vmInit(void) {
8032 off_t totsize;
8033 int pipefds[2];
8034 size_t stacksize;
8035
8036 if (server.vm_max_threads != 0)
8037 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8038
8039 expandVmSwapFilename();
8040 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8041 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8042 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8043 }
8044 if (server.vm_fp == NULL) {
8045 redisLog(REDIS_WARNING,
8046 "Impossible to open the swap file: %s. Exiting.",
8047 strerror(errno));
8048 exit(1);
8049 }
8050 server.vm_fd = fileno(server.vm_fp);
8051 server.vm_next_page = 0;
8052 server.vm_near_pages = 0;
8053 server.vm_stats_used_pages = 0;
8054 server.vm_stats_swapped_objects = 0;
8055 server.vm_stats_swapouts = 0;
8056 server.vm_stats_swapins = 0;
8057 totsize = server.vm_pages*server.vm_page_size;
8058 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8059 if (ftruncate(server.vm_fd,totsize) == -1) {
8060 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8061 strerror(errno));
8062 exit(1);
8063 } else {
8064 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8065 }
8066 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8067 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8068 (long long) (server.vm_pages+7)/8, server.vm_pages);
8069 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8070
8071 /* Initialize threaded I/O (used by Virtual Memory) */
8072 server.io_newjobs = listCreate();
8073 server.io_processing = listCreate();
8074 server.io_processed = listCreate();
8075 server.io_ready_clients = listCreate();
8076 pthread_mutex_init(&server.io_mutex,NULL);
8077 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8078 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8079 server.io_active_threads = 0;
8080 if (pipe(pipefds) == -1) {
8081 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8082 ,strerror(errno));
8083 exit(1);
8084 }
8085 server.io_ready_pipe_read = pipefds[0];
8086 server.io_ready_pipe_write = pipefds[1];
8087 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8088 /* LZF requires a lot of stack */
8089 pthread_attr_init(&server.io_threads_attr);
8090 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8091 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8092 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8093 /* Listen for events in the threaded I/O pipe */
8094 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8095 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8096 oom("creating file event");
8097 }
8098
8099 /* Mark the page as used */
8100 static void vmMarkPageUsed(off_t page) {
8101 off_t byte = page/8;
8102 int bit = page&7;
8103 redisAssert(vmFreePage(page) == 1);
8104 server.vm_bitmap[byte] |= 1<<bit;
8105 }
8106
8107 /* Mark N contiguous pages as used, with 'page' being the first. */
8108 static void vmMarkPagesUsed(off_t page, off_t count) {
8109 off_t j;
8110
8111 for (j = 0; j < count; j++)
8112 vmMarkPageUsed(page+j);
8113 server.vm_stats_used_pages += count;
8114 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8115 (long long)count, (long long)page);
8116 }
8117
8118 /* Mark the page as free */
8119 static void vmMarkPageFree(off_t page) {
8120 off_t byte = page/8;
8121 int bit = page&7;
8122 redisAssert(vmFreePage(page) == 0);
8123 server.vm_bitmap[byte] &= ~(1<<bit);
8124 }
8125
8126 /* Mark N contiguous pages as free, with 'page' being the first. */
8127 static void vmMarkPagesFree(off_t page, off_t count) {
8128 off_t j;
8129
8130 for (j = 0; j < count; j++)
8131 vmMarkPageFree(page+j);
8132 server.vm_stats_used_pages -= count;
8133 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8134 (long long)count, (long long)page);
8135 }
8136
8137 /* Test if the page is free */
8138 static int vmFreePage(off_t page) {
8139 off_t byte = page/8;
8140 int bit = page&7;
8141 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8142 }
8143
8144 /* Find N contiguous free pages storing the first page of the cluster in *first.
8145 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8146 * REDIS_ERR is returned.
8147 *
8148 * This function uses a simple algorithm: we try to allocate
8149 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8150 * again from the start of the swap file searching for free spaces.
8151 *
8152 * If it looks pretty clear that there are no free pages near our offset
8153 * we try to find less populated places doing a forward jump of
8154 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8155 * without hurry, and then we jump again and so forth...
8156 *
8157 * This function can be improved using a free list to avoid to guess
8158 * too much, since we could collect data about freed pages.
8159 *
8160 * note: I implemented this function just after watching an episode of
8161 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8162 */
8163 static int vmFindContiguousPages(off_t *first, off_t n) {
8164 off_t base, offset = 0, since_jump = 0, numfree = 0;
8165
8166 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8167 server.vm_near_pages = 0;
8168 server.vm_next_page = 0;
8169 }
8170 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8171 base = server.vm_next_page;
8172
8173 while(offset < server.vm_pages) {
8174 off_t this = base+offset;
8175
8176 /* If we overflow, restart from page zero */
8177 if (this >= server.vm_pages) {
8178 this -= server.vm_pages;
8179 if (this == 0) {
8180 /* Just overflowed, what we found on tail is no longer
8181 * interesting, as it's no longer contiguous. */
8182 numfree = 0;
8183 }
8184 }
8185 if (vmFreePage(this)) {
8186 /* This is a free page */
8187 numfree++;
8188 /* Already got N free pages? Return to the caller, with success */
8189 if (numfree == n) {
8190 *first = this-(n-1);
8191 server.vm_next_page = this+1;
8192 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8193 return REDIS_OK;
8194 }
8195 } else {
8196 /* The current one is not a free page */
8197 numfree = 0;
8198 }
8199
8200 /* Fast-forward if the current page is not free and we already
8201 * searched enough near this place. */
8202 since_jump++;
8203 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8204 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8205 since_jump = 0;
8206 /* Note that even if we rewind after the jump, we are don't need
8207 * to make sure numfree is set to zero as we only jump *if* it
8208 * is set to zero. */
8209 } else {
8210 /* Otherwise just check the next page */
8211 offset++;
8212 }
8213 }
8214 return REDIS_ERR;
8215 }
8216
8217 /* Write the specified object at the specified page of the swap file */
8218 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8219 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8220 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8221 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8222 redisLog(REDIS_WARNING,
8223 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8224 strerror(errno));
8225 return REDIS_ERR;
8226 }
8227 rdbSaveObject(server.vm_fp,o);
8228 fflush(server.vm_fp);
8229 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8230 return REDIS_OK;
8231 }
8232
8233 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8234 * needed to later retrieve the object into the key object.
8235 * If we can't find enough contiguous empty pages to swap the object on disk
8236 * REDIS_ERR is returned. */
8237 static int vmSwapObjectBlocking(robj *key, robj *val) {
8238 off_t pages = rdbSavedObjectPages(val,NULL);
8239 off_t page;
8240
8241 assert(key->storage == REDIS_VM_MEMORY);
8242 assert(key->refcount == 1);
8243 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8244 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8245 key->vm.page = page;
8246 key->vm.usedpages = pages;
8247 key->storage = REDIS_VM_SWAPPED;
8248 key->vtype = val->type;
8249 decrRefCount(val); /* Deallocate the object from memory. */
8250 vmMarkPagesUsed(page,pages);
8251 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8252 (unsigned char*) key->ptr,
8253 (unsigned long long) page, (unsigned long long) pages);
8254 server.vm_stats_swapped_objects++;
8255 server.vm_stats_swapouts++;
8256 return REDIS_OK;
8257 }
8258
8259 static robj *vmReadObjectFromSwap(off_t page, int type) {
8260 robj *o;
8261
8262 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8263 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8264 redisLog(REDIS_WARNING,
8265 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8266 strerror(errno));
8267 _exit(1);
8268 }
8269 o = rdbLoadObject(type,server.vm_fp);
8270 if (o == NULL) {
8271 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8272 _exit(1);
8273 }
8274 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8275 return o;
8276 }
8277
8278 /* Load the value object relative to the 'key' object from swap to memory.
8279 * The newly allocated object is returned.
8280 *
8281 * If preview is true the unserialized object is returned to the caller but
8282 * no changes are made to the key object, nor the pages are marked as freed */
8283 static robj *vmGenericLoadObject(robj *key, int preview) {
8284 robj *val;
8285
8286 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8287 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8288 if (!preview) {
8289 key->storage = REDIS_VM_MEMORY;
8290 key->vm.atime = server.unixtime;
8291 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8292 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8293 (unsigned char*) key->ptr);
8294 server.vm_stats_swapped_objects--;
8295 } else {
8296 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8297 (unsigned char*) key->ptr);
8298 }
8299 server.vm_stats_swapins++;
8300 return val;
8301 }
8302
8303 /* Plain object loading, from swap to memory */
8304 static robj *vmLoadObject(robj *key) {
8305 /* If we are loading the object in background, stop it, we
8306 * need to load this object synchronously ASAP. */
8307 if (key->storage == REDIS_VM_LOADING)
8308 vmCancelThreadedIOJob(key);
8309 return vmGenericLoadObject(key,0);
8310 }
8311
8312 /* Just load the value on disk, without to modify the key.
8313 * This is useful when we want to perform some operation on the value
8314 * without to really bring it from swap to memory, like while saving the
8315 * dataset or rewriting the append only log. */
8316 static robj *vmPreviewObject(robj *key) {
8317 return vmGenericLoadObject(key,1);
8318 }
8319
8320 /* How a good candidate is this object for swapping?
8321 * The better candidate it is, the greater the returned value.
8322 *
8323 * Currently we try to perform a fast estimation of the object size in
8324 * memory, and combine it with aging informations.
8325 *
8326 * Basically swappability = idle-time * log(estimated size)
8327 *
8328 * Bigger objects are preferred over smaller objects, but not
8329 * proportionally, this is why we use the logarithm. This algorithm is
8330 * just a first try and will probably be tuned later. */
8331 static double computeObjectSwappability(robj *o) {
8332 time_t age = server.unixtime - o->vm.atime;
8333 long asize = 0;
8334 list *l;
8335 dict *d;
8336 struct dictEntry *de;
8337 int z;
8338
8339 if (age <= 0) return 0;
8340 switch(o->type) {
8341 case REDIS_STRING:
8342 if (o->encoding != REDIS_ENCODING_RAW) {
8343 asize = sizeof(*o);
8344 } else {
8345 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8346 }
8347 break;
8348 case REDIS_LIST:
8349 l = o->ptr;
8350 listNode *ln = listFirst(l);
8351
8352 asize = sizeof(list);
8353 if (ln) {
8354 robj *ele = ln->value;
8355 long elesize;
8356
8357 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8358 (sizeof(*o)+sdslen(ele->ptr)) :
8359 sizeof(*o);
8360 asize += (sizeof(listNode)+elesize)*listLength(l);
8361 }
8362 break;
8363 case REDIS_SET:
8364 case REDIS_ZSET:
8365 z = (o->type == REDIS_ZSET);
8366 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8367
8368 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8369 if (z) asize += sizeof(zset)-sizeof(dict);
8370 if (dictSize(d)) {
8371 long elesize;
8372 robj *ele;
8373
8374 de = dictGetRandomKey(d);
8375 ele = dictGetEntryKey(de);
8376 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8377 (sizeof(*o)+sdslen(ele->ptr)) :
8378 sizeof(*o);
8379 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8380 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8381 }
8382 break;
8383 case REDIS_HASH:
8384 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8385 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8386 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8387 unsigned int klen, vlen;
8388 unsigned char *key, *val;
8389
8390 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8391 klen = 0;
8392 vlen = 0;
8393 }
8394 asize = len*(klen+vlen+3);
8395 } else if (o->encoding == REDIS_ENCODING_HT) {
8396 d = o->ptr;
8397 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8398 if (dictSize(d)) {
8399 long elesize;
8400 robj *ele;
8401
8402 de = dictGetRandomKey(d);
8403 ele = dictGetEntryKey(de);
8404 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8405 (sizeof(*o)+sdslen(ele->ptr)) :
8406 sizeof(*o);
8407 ele = dictGetEntryVal(de);
8408 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8409 (sizeof(*o)+sdslen(ele->ptr)) :
8410 sizeof(*o);
8411 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8412 }
8413 }
8414 break;
8415 }
8416 return (double)age*log(1+asize);
8417 }
8418
8419 /* Try to swap an object that's a good candidate for swapping.
8420 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8421 * to swap any object at all.
8422 *
8423 * If 'usethreaded' is true, Redis will try to swap the object in background
8424 * using I/O threads. */
8425 static int vmSwapOneObject(int usethreads) {
8426 int j, i;
8427 struct dictEntry *best = NULL;
8428 double best_swappability = 0;
8429 redisDb *best_db = NULL;
8430 robj *key, *val;
8431
8432 for (j = 0; j < server.dbnum; j++) {
8433 redisDb *db = server.db+j;
8434 /* Why maxtries is set to 100?
8435 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8436 * are swappable objects */
8437 int maxtries = 100;
8438
8439 if (dictSize(db->dict) == 0) continue;
8440 for (i = 0; i < 5; i++) {
8441 dictEntry *de;
8442 double swappability;
8443
8444 if (maxtries) maxtries--;
8445 de = dictGetRandomKey(db->dict);
8446 key = dictGetEntryKey(de);
8447 val = dictGetEntryVal(de);
8448 /* Only swap objects that are currently in memory.
8449 *
8450 * Also don't swap shared objects if threaded VM is on, as we
8451 * try to ensure that the main thread does not touch the
8452 * object while the I/O thread is using it, but we can't
8453 * control other keys without adding additional mutex. */
8454 if (key->storage != REDIS_VM_MEMORY ||
8455 (server.vm_max_threads != 0 && val->refcount != 1)) {
8456 if (maxtries) i--; /* don't count this try */
8457 continue;
8458 }
8459 swappability = computeObjectSwappability(val);
8460 if (!best || swappability > best_swappability) {
8461 best = de;
8462 best_swappability = swappability;
8463 best_db = db;
8464 }
8465 }
8466 }
8467 if (best == NULL) return REDIS_ERR;
8468 key = dictGetEntryKey(best);
8469 val = dictGetEntryVal(best);
8470
8471 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8472 key->ptr, best_swappability);
8473
8474 /* Unshare the key if needed */
8475 if (key->refcount > 1) {
8476 robj *newkey = dupStringObject(key);
8477 decrRefCount(key);
8478 key = dictGetEntryKey(best) = newkey;
8479 }
8480 /* Swap it */
8481 if (usethreads) {
8482 vmSwapObjectThreaded(key,val,best_db);
8483 return REDIS_OK;
8484 } else {
8485 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8486 dictGetEntryVal(best) = NULL;
8487 return REDIS_OK;
8488 } else {
8489 return REDIS_ERR;
8490 }
8491 }
8492 }
8493
/* Swap one object synchronously: vmSwapOneObject() without I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8497
/* Swap one object in background: vmSwapOneObject() using the I/O threads.
 * Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8501
8502 /* Return true if it's safe to swap out objects in a given moment.
8503 * Basically we don't want to swap objects out while there is a BGSAVE
8504 * or a BGAEOREWRITE running in backgroud. */
8505 static int vmCanSwapOut(void) {
8506 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8507 }
8508
8509 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8510 * and was deleted. Otherwise 0 is returned. */
8511 static int deleteIfSwapped(redisDb *db, robj *key) {
8512 dictEntry *de;
8513 robj *foundkey;
8514
8515 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8516 foundkey = dictGetEntryKey(de);
8517 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8518 deleteKey(db,key);
8519 return 1;
8520 }
8521
8522 /* =================== Virtual Memory - Threaded I/O ======================= */
8523
8524 static void freeIOJob(iojob *j) {
8525 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8526 j->type == REDIS_IOJOB_DO_SWAP ||
8527 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8528 decrRefCount(j->val);
8529 decrRefCount(j->key);
8530 zfree(j);
8531 }
8532
8533 /* Every time a thread finished a Job, it writes a byte into the write side
8534 * of an unix pipe in order to "awake" the main thread, and this function
8535 * is called. */
8536 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8537 int mask)
8538 {
8539 char buf[1];
8540 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8541 REDIS_NOTUSED(el);
8542 REDIS_NOTUSED(mask);
8543 REDIS_NOTUSED(privdata);
8544
8545 /* For every byte we read in the read side of the pipe, there is one
8546 * I/O job completed to process. */
8547 while((retval = read(fd,buf,1)) == 1) {
8548 iojob *j;
8549 listNode *ln;
8550 robj *key;
8551 struct dictEntry *de;
8552
8553 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8554
8555 /* Get the processed element (the oldest one) */
8556 lockThreadedIO();
8557 assert(listLength(server.io_processed) != 0);
8558 if (toprocess == -1) {
8559 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8560 if (toprocess <= 0) toprocess = 1;
8561 }
8562 ln = listFirst(server.io_processed);
8563 j = ln->value;
8564 listDelNode(server.io_processed,ln);
8565 unlockThreadedIO();
8566 /* If this job is marked as canceled, just ignore it */
8567 if (j->canceled) {
8568 freeIOJob(j);
8569 continue;
8570 }
8571 /* Post process it in the main thread, as there are things we
8572 * can do just here to avoid race conditions and/or invasive locks */
8573 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8574 de = dictFind(j->db->dict,j->key);
8575 assert(de != NULL);
8576 key = dictGetEntryKey(de);
8577 if (j->type == REDIS_IOJOB_LOAD) {
8578 redisDb *db;
8579
8580 /* Key loaded, bring it at home */
8581 key->storage = REDIS_VM_MEMORY;
8582 key->vm.atime = server.unixtime;
8583 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8584 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8585 (unsigned char*) key->ptr);
8586 server.vm_stats_swapped_objects--;
8587 server.vm_stats_swapins++;
8588 dictGetEntryVal(de) = j->val;
8589 incrRefCount(j->val);
8590 db = j->db;
8591 freeIOJob(j);
8592 /* Handle clients waiting for this key to be loaded. */
8593 handleClientsBlockedOnSwappedKey(db,key);
8594 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8595 /* Now we know the amount of pages required to swap this object.
8596 * Let's find some space for it, and queue this task again
8597 * rebranded as REDIS_IOJOB_DO_SWAP. */
8598 if (!vmCanSwapOut() ||
8599 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8600 {
8601 /* Ooops... no space or we can't swap as there is
8602 * a fork()ed Redis trying to save stuff on disk. */
8603 freeIOJob(j);
8604 key->storage = REDIS_VM_MEMORY; /* undo operation */
8605 } else {
8606 /* Note that we need to mark this pages as used now,
8607 * if the job will be canceled, we'll mark them as freed
8608 * again. */
8609 vmMarkPagesUsed(j->page,j->pages);
8610 j->type = REDIS_IOJOB_DO_SWAP;
8611 lockThreadedIO();
8612 queueIOJob(j);
8613 unlockThreadedIO();
8614 }
8615 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8616 robj *val;
8617
8618 /* Key swapped. We can finally free some memory. */
8619 if (key->storage != REDIS_VM_SWAPPING) {
8620 printf("key->storage: %d\n",key->storage);
8621 printf("key->name: %s\n",(char*)key->ptr);
8622 printf("key->refcount: %d\n",key->refcount);
8623 printf("val: %p\n",(void*)j->val);
8624 printf("val->type: %d\n",j->val->type);
8625 printf("val->ptr: %s\n",(char*)j->val->ptr);
8626 }
8627 redisAssert(key->storage == REDIS_VM_SWAPPING);
8628 val = dictGetEntryVal(de);
8629 key->vm.page = j->page;
8630 key->vm.usedpages = j->pages;
8631 key->storage = REDIS_VM_SWAPPED;
8632 key->vtype = j->val->type;
8633 decrRefCount(val); /* Deallocate the object from memory. */
8634 dictGetEntryVal(de) = NULL;
8635 redisLog(REDIS_DEBUG,
8636 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8637 (unsigned char*) key->ptr,
8638 (unsigned long long) j->page, (unsigned long long) j->pages);
8639 server.vm_stats_swapped_objects++;
8640 server.vm_stats_swapouts++;
8641 freeIOJob(j);
8642 /* Put a few more swap requests in queue if we are still
8643 * out of memory */
8644 if (trytoswap && vmCanSwapOut() &&
8645 zmalloc_used_memory() > server.vm_max_memory)
8646 {
8647 int more = 1;
8648 while(more) {
8649 lockThreadedIO();
8650 more = listLength(server.io_newjobs) <
8651 (unsigned) server.vm_max_threads;
8652 unlockThreadedIO();
8653 /* Don't waste CPU time if swappable objects are rare. */
8654 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8655 trytoswap = 0;
8656 break;
8657 }
8658 }
8659 }
8660 }
8661 processed++;
8662 if (processed == toprocess) return;
8663 }
8664 if (retval < 0 && errno != EAGAIN) {
8665 redisLog(REDIS_WARNING,
8666 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8667 strerror(errno));
8668 }
8669 }
8670
8671 static void lockThreadedIO(void) {
8672 pthread_mutex_lock(&server.io_mutex);
8673 }
8674
8675 static void unlockThreadedIO(void) {
8676 pthread_mutex_unlock(&server.io_mutex);
8677 }
8678
8679 /* Remove the specified object from the threaded I/O queue if still not
8680 * processed, otherwise make sure to flag it as canceled. */
8681 static void vmCancelThreadedIOJob(robj *o) {
8682 list *lists[3] = {
8683 server.io_newjobs, /* 0 */
8684 server.io_processing, /* 1 */
8685 server.io_processed /* 2 */
8686 };
8687 int i;
8688
8689 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8690 again:
8691 lockThreadedIO();
8692 /* Search for a matching key in one of the queues */
8693 for (i = 0; i < 3; i++) {
8694 listNode *ln;
8695 listIter li;
8696
8697 listRewind(lists[i],&li);
8698 while ((ln = listNext(&li)) != NULL) {
8699 iojob *job = ln->value;
8700
8701 if (job->canceled) continue; /* Skip this, already canceled. */
8702 if (compareStringObjects(job->key,o) == 0) {
8703 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8704 (void*)job, (char*)o->ptr, job->type, i);
8705 /* Mark the pages as free since the swap didn't happened
8706 * or happened but is now discarded. */
8707 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8708 vmMarkPagesFree(job->page,job->pages);
8709 /* Cancel the job. It depends on the list the job is
8710 * living in. */
8711 switch(i) {
8712 case 0: /* io_newjobs */
8713 /* If the job was yet not processed the best thing to do
8714 * is to remove it from the queue at all */
8715 freeIOJob(job);
8716 listDelNode(lists[i],ln);
8717 break;
8718 case 1: /* io_processing */
8719 /* Oh Shi- the thread is messing with the Job:
8720 *
8721 * Probably it's accessing the object if this is a
8722 * PREPARE_SWAP or DO_SWAP job.
8723 * If it's a LOAD job it may be reading from disk and
8724 * if we don't wait for the job to terminate before to
8725 * cancel it, maybe in a few microseconds data can be
8726 * corrupted in this pages. So the short story is:
8727 *
8728 * Better to wait for the job to move into the
8729 * next queue (processed)... */
8730
8731 /* We try again and again until the job is completed. */
8732 unlockThreadedIO();
8733 /* But let's wait some time for the I/O thread
8734 * to finish with this job. After all this condition
8735 * should be very rare. */
8736 usleep(1);
8737 goto again;
8738 case 2: /* io_processed */
8739 /* The job was already processed, that's easy...
8740 * just mark it as canceled so that we'll ignore it
8741 * when processing completed jobs. */
8742 job->canceled = 1;
8743 break;
8744 }
8745 /* Finally we have to adjust the storage type of the object
8746 * in order to "UNDO" the operaiton. */
8747 if (o->storage == REDIS_VM_LOADING)
8748 o->storage = REDIS_VM_SWAPPED;
8749 else if (o->storage == REDIS_VM_SWAPPING)
8750 o->storage = REDIS_VM_MEMORY;
8751 unlockThreadedIO();
8752 return;
8753 }
8754 }
8755 }
8756 unlockThreadedIO();
8757 assert(1 != 1); /* We should never reach this */
8758 }
8759
8760 static void *IOThreadEntryPoint(void *arg) {
8761 iojob *j;
8762 listNode *ln;
8763 REDIS_NOTUSED(arg);
8764
8765 pthread_detach(pthread_self());
8766 while(1) {
8767 /* Get a new job to process */
8768 lockThreadedIO();
8769 if (listLength(server.io_newjobs) == 0) {
8770 /* No new jobs in queue, exit. */
8771 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8772 (long) pthread_self());
8773 server.io_active_threads--;
8774 unlockThreadedIO();
8775 return NULL;
8776 }
8777 ln = listFirst(server.io_newjobs);
8778 j = ln->value;
8779 listDelNode(server.io_newjobs,ln);
8780 /* Add the job in the processing queue */
8781 j->thread = pthread_self();
8782 listAddNodeTail(server.io_processing,j);
8783 ln = listLast(server.io_processing); /* We use ln later to remove it */
8784 unlockThreadedIO();
8785 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8786 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8787
8788 /* Process the Job */
8789 if (j->type == REDIS_IOJOB_LOAD) {
8790 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8791 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8792 FILE *fp = fopen("/dev/null","w+");
8793 j->pages = rdbSavedObjectPages(j->val,fp);
8794 fclose(fp);
8795 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8796 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8797 j->canceled = 1;
8798 }
8799
8800 /* Done: insert the job into the processed queue */
8801 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8802 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8803 lockThreadedIO();
8804 listDelNode(server.io_processing,ln);
8805 listAddNodeTail(server.io_processed,j);
8806 unlockThreadedIO();
8807
8808 /* Signal the main thread there is new stuff to process */
8809 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8810 }
8811 return NULL; /* never reached */
8812 }
8813
8814 static void spawnIOThread(void) {
8815 pthread_t thread;
8816 sigset_t mask, omask;
8817 int err;
8818
8819 sigemptyset(&mask);
8820 sigaddset(&mask,SIGCHLD);
8821 sigaddset(&mask,SIGHUP);
8822 sigaddset(&mask,SIGPIPE);
8823 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8824 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8825 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8826 strerror(err));
8827 usleep(1000000);
8828 }
8829 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8830 server.io_active_threads++;
8831 }
8832
8833 /* We need to wait for the last thread to exit before we are able to
8834 * fork() in order to BGSAVE or BGREWRITEAOF. */
8835 static void waitEmptyIOJobsQueue(void) {
8836 while(1) {
8837 int io_processed_len;
8838
8839 lockThreadedIO();
8840 if (listLength(server.io_newjobs) == 0 &&
8841 listLength(server.io_processing) == 0 &&
8842 server.io_active_threads == 0)
8843 {
8844 unlockThreadedIO();
8845 return;
8846 }
8847 /* While waiting for empty jobs queue condition we post-process some
8848 * finshed job, as I/O threads may be hanging trying to write against
8849 * the io_ready_pipe_write FD but there are so much pending jobs that
8850 * it's blocking. */
8851 io_processed_len = listLength(server.io_processed);
8852 unlockThreadedIO();
8853 if (io_processed_len) {
8854 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8855 usleep(1000); /* 1 millisecond */
8856 } else {
8857 usleep(10000); /* 10 milliseconds */
8858 }
8859 }
8860 }
8861
8862 static void vmReopenSwapFile(void) {
8863 /* Note: we don't close the old one as we are in the child process
8864 * and don't want to mess at all with the original file object. */
8865 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8866 if (server.vm_fp == NULL) {
8867 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8868 server.vm_swap_file);
8869 _exit(1);
8870 }
8871 server.vm_fd = fileno(server.vm_fp);
8872 }
8873
8874 /* This function must be called while with threaded IO locked */
8875 static void queueIOJob(iojob *j) {
8876 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8877 (void*)j, j->type, (char*)j->key->ptr);
8878 listAddNodeTail(server.io_newjobs,j);
8879 if (server.io_active_threads < server.vm_max_threads)
8880 spawnIOThread();
8881 }
8882
8883 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8884 iojob *j;
8885
8886 assert(key->storage == REDIS_VM_MEMORY);
8887 assert(key->refcount == 1);
8888
8889 j = zmalloc(sizeof(*j));
8890 j->type = REDIS_IOJOB_PREPARE_SWAP;
8891 j->db = db;
8892 j->key = dupStringObject(key);
8893 j->val = val;
8894 incrRefCount(val);
8895 j->canceled = 0;
8896 j->thread = (pthread_t) -1;
8897 key->storage = REDIS_VM_SWAPPING;
8898
8899 lockThreadedIO();
8900 queueIOJob(j);
8901 unlockThreadedIO();
8902 return REDIS_OK;
8903 }
8904
8905 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8906
8907 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8908 * If there is not already a job loading the key, it is craeted.
8909 * The key is added to the io_keys list in the client structure, and also
8910 * in the hash table mapping swapped keys to waiting clients, that is,
8911 * server.io_waited_keys. */
8912 static int waitForSwappedKey(redisClient *c, robj *key) {
8913 struct dictEntry *de;
8914 robj *o;
8915 list *l;
8916
8917 /* If the key does not exist or is already in RAM we don't need to
8918 * block the client at all. */
8919 de = dictFind(c->db->dict,key);
8920 if (de == NULL) return 0;
8921 o = dictGetEntryKey(de);
8922 if (o->storage == REDIS_VM_MEMORY) {
8923 return 0;
8924 } else if (o->storage == REDIS_VM_SWAPPING) {
8925 /* We were swapping the key, undo it! */
8926 vmCancelThreadedIOJob(o);
8927 return 0;
8928 }
8929
8930 /* OK: the key is either swapped, or being loaded just now. */
8931
8932 /* Add the key to the list of keys this client is waiting for.
8933 * This maps clients to keys they are waiting for. */
8934 listAddNodeTail(c->io_keys,key);
8935 incrRefCount(key);
8936
8937 /* Add the client to the swapped keys => clients waiting map. */
8938 de = dictFind(c->db->io_keys,key);
8939 if (de == NULL) {
8940 int retval;
8941
8942 /* For every key we take a list of clients blocked for it */
8943 l = listCreate();
8944 retval = dictAdd(c->db->io_keys,key,l);
8945 incrRefCount(key);
8946 assert(retval == DICT_OK);
8947 } else {
8948 l = dictGetEntryVal(de);
8949 }
8950 listAddNodeTail(l,c);
8951
8952 /* Are we already loading the key from disk? If not create a job */
8953 if (o->storage == REDIS_VM_SWAPPED) {
8954 iojob *j;
8955
8956 o->storage = REDIS_VM_LOADING;
8957 j = zmalloc(sizeof(*j));
8958 j->type = REDIS_IOJOB_LOAD;
8959 j->db = c->db;
8960 j->key = dupStringObject(key);
8961 j->key->vtype = o->vtype;
8962 j->page = o->vm.page;
8963 j->val = NULL;
8964 j->canceled = 0;
8965 j->thread = (pthread_t) -1;
8966 lockThreadedIO();
8967 queueIOJob(j);
8968 unlockThreadedIO();
8969 }
8970 return 1;
8971 }
8972
8973 /* Preload keys needed for the ZUNION and ZINTER commands. */
8974 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
8975 int i, num;
8976 num = atoi(c->argv[2]->ptr);
8977 for (i = 0; i < num; i++) {
8978 waitForSwappedKey(c,c->argv[3+i]);
8979 }
8980 }
8981
8982 /* Is this client attempting to run a command against swapped keys?
8983 * If so, block it ASAP, load the keys in background, then resume it.
8984 *
8985 * The important idea about this function is that it can fail! If keys will
8986 * still be swapped when the client is resumed, this key lookups will
8987 * just block loading keys from disk. In practical terms this should only
8988 * happen with SORT BY command or if there is a bug in this function.
8989 *
8990 * Return 1 if the client is marked as blocked, 0 if the client can
8991 * continue as the keys it is going to access appear to be in memory. */
8992 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8993 int j, last;
8994
8995 if (cmd->vm_preload_proc != NULL) {
8996 cmd->vm_preload_proc(c);
8997 } else {
8998 if (cmd->vm_firstkey == 0) return 0;
8999 last = cmd->vm_lastkey;
9000 if (last < 0) last = c->argc+last;
9001 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9002 waitForSwappedKey(c,c->argv[j]);
9003 }
9004
9005 /* If the client was blocked for at least one key, mark it as blocked. */
9006 if (listLength(c->io_keys)) {
9007 c->flags |= REDIS_IO_WAIT;
9008 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9009 server.vm_blocked_clients++;
9010 return 1;
9011 } else {
9012 return 0;
9013 }
9014 }
9015
9016 /* Remove the 'key' from the list of blocked keys for a given client.
9017 *
9018 * The function returns 1 when there are no longer blocking keys after
9019 * the current one was removed (and the client can be unblocked). */
9020 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9021 list *l;
9022 listNode *ln;
9023 listIter li;
9024 struct dictEntry *de;
9025
9026 /* Remove the key from the list of keys this client is waiting for. */
9027 listRewind(c->io_keys,&li);
9028 while ((ln = listNext(&li)) != NULL) {
9029 if (compareStringObjects(ln->value,key) == 0) {
9030 listDelNode(c->io_keys,ln);
9031 break;
9032 }
9033 }
9034 assert(ln != NULL);
9035
9036 /* Remove the client form the key => waiting clients map. */
9037 de = dictFind(c->db->io_keys,key);
9038 assert(de != NULL);
9039 l = dictGetEntryVal(de);
9040 ln = listSearchKey(l,c);
9041 assert(ln != NULL);
9042 listDelNode(l,ln);
9043 if (listLength(l) == 0)
9044 dictDelete(c->db->io_keys,key);
9045
9046 return listLength(c->io_keys) == 0;
9047 }
9048
9049 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9050 struct dictEntry *de;
9051 list *l;
9052 listNode *ln;
9053 int len;
9054
9055 de = dictFind(db->io_keys,key);
9056 if (!de) return;
9057
9058 l = dictGetEntryVal(de);
9059 len = listLength(l);
9060 /* Note: we can't use something like while(listLength(l)) as the list
9061 * can be freed by the calling function when we remove the last element. */
9062 while (len--) {
9063 ln = listFirst(l);
9064 redisClient *c = ln->value;
9065
9066 if (dontWaitForSwappedKey(c,key)) {
9067 /* Put the client in the list of clients ready to go as we
9068 * loaded all the keys about it. */
9069 listAddNodeTail(server.io_ready_clients,c);
9070 }
9071 }
9072 }
9073
9074 /* =========================== Remote Configuration ========================= */
9075
9076 static void configSetCommand(redisClient *c) {
9077 robj *o = getDecodedObject(c->argv[3]);
9078 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9079 zfree(server.dbfilename);
9080 server.dbfilename = zstrdup(o->ptr);
9081 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9082 zfree(server.requirepass);
9083 server.requirepass = zstrdup(o->ptr);
9084 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9085 zfree(server.masterauth);
9086 server.masterauth = zstrdup(o->ptr);
9087 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9088 server.maxmemory = strtoll(o->ptr, NULL, 10);
9089 } else {
9090 addReplySds(c,sdscatprintf(sdsempty(),
9091 "-ERR not supported CONFIG parameter %s\r\n",
9092 (char*)c->argv[2]->ptr));
9093 decrRefCount(o);
9094 return;
9095 }
9096 decrRefCount(o);
9097 addReply(c,shared.ok);
9098 }
9099
9100 static void configGetCommand(redisClient *c) {
9101 robj *o = getDecodedObject(c->argv[2]);
9102 robj *lenobj = createObject(REDIS_STRING,NULL);
9103 char *pattern = o->ptr;
9104 int matches = 0;
9105
9106 addReply(c,lenobj);
9107 decrRefCount(lenobj);
9108
9109 if (stringmatch(pattern,"dbfilename",0)) {
9110 addReplyBulkCString(c,"dbfilename");
9111 addReplyBulkCString(c,server.dbfilename);
9112 matches++;
9113 }
9114 if (stringmatch(pattern,"requirepass",0)) {
9115 addReplyBulkCString(c,"requirepass");
9116 addReplyBulkCString(c,server.requirepass);
9117 matches++;
9118 }
9119 if (stringmatch(pattern,"masterauth",0)) {
9120 addReplyBulkCString(c,"masterauth");
9121 addReplyBulkCString(c,server.masterauth);
9122 matches++;
9123 }
9124 if (stringmatch(pattern,"maxmemory",0)) {
9125 char buf[128];
9126
9127 snprintf(buf,128,"%llu\n",server.maxmemory);
9128 addReplyBulkCString(c,"maxmemory");
9129 addReplyBulkCString(c,buf);
9130 matches++;
9131 }
9132 decrRefCount(o);
9133 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9134 }
9135
9136 static void configCommand(redisClient *c) {
9137 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9138 if (c->argc != 4) goto badarity;
9139 configSetCommand(c);
9140 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9141 if (c->argc != 3) goto badarity;
9142 configGetCommand(c);
9143 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9144 if (c->argc != 2) goto badarity;
9145 server.stat_numcommands = 0;
9146 server.stat_numconnections = 0;
9147 server.stat_expiredkeys = 0;
9148 server.stat_starttime = time(NULL);
9149 addReply(c,shared.ok);
9150 } else {
9151 addReplySds(c,sdscatprintf(sdsempty(),
9152 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9153 }
9154 return;
9155
9156 badarity:
9157 addReplySds(c,sdscatprintf(sdsempty(),
9158 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9159 (char*) c->argv[1]->ptr));
9160 }
9161
9162 /* ================================= Debugging ============================== */
9163
9164 static void debugCommand(redisClient *c) {
9165 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9166 *((char*)-1) = 'x';
9167 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9168 if (rdbSave(server.dbfilename) != REDIS_OK) {
9169 addReply(c,shared.err);
9170 return;
9171 }
9172 emptyDb();
9173 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9174 addReply(c,shared.err);
9175 return;
9176 }
9177 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9178 addReply(c,shared.ok);
9179 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9180 emptyDb();
9181 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9182 addReply(c,shared.err);
9183 return;
9184 }
9185 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9186 addReply(c,shared.ok);
9187 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9188 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9189 robj *key, *val;
9190
9191 if (!de) {
9192 addReply(c,shared.nokeyerr);
9193 return;
9194 }
9195 key = dictGetEntryKey(de);
9196 val = dictGetEntryVal(de);
9197 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9198 key->storage == REDIS_VM_SWAPPING)) {
9199 char *strenc;
9200 char buf[128];
9201
9202 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9203 strenc = strencoding[val->encoding];
9204 } else {
9205 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9206 strenc = buf;
9207 }
9208 addReplySds(c,sdscatprintf(sdsempty(),
9209 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9210 "encoding:%s serializedlength:%lld\r\n",
9211 (void*)key, key->refcount, (void*)val, val->refcount,
9212 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9213 } else {
9214 addReplySds(c,sdscatprintf(sdsempty(),
9215 "+Key at:%p refcount:%d, value swapped at: page %llu "
9216 "using %llu pages\r\n",
9217 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9218 (unsigned long long) key->vm.usedpages));
9219 }
9220 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9221 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9222 robj *key, *val;
9223
9224 if (!server.vm_enabled) {
9225 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9226 return;
9227 }
9228 if (!de) {
9229 addReply(c,shared.nokeyerr);
9230 return;
9231 }
9232 key = dictGetEntryKey(de);
9233 val = dictGetEntryVal(de);
9234 /* If the key is shared we want to create a copy */
9235 if (key->refcount > 1) {
9236 robj *newkey = dupStringObject(key);
9237 decrRefCount(key);
9238 key = dictGetEntryKey(de) = newkey;
9239 }
9240 /* Swap it */
9241 if (key->storage != REDIS_VM_MEMORY) {
9242 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9243 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9244 dictGetEntryVal(de) = NULL;
9245 addReply(c,shared.ok);
9246 } else {
9247 addReply(c,shared.err);
9248 }
9249 } else {
9250 addReplySds(c,sdsnew(
9251 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9252 }
9253 }
9254
9255 static void _redisAssert(char *estr, char *file, int line) {
9256 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9257 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9258 #ifdef HAVE_BACKTRACE
9259 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9260 *((char*)-1) = 'x';
9261 #endif
9262 }
9263
9264 /* =================================== Main! ================================ */
9265
9266 #ifdef __linux__
/* Return the number found in the first line of
 * /proc/sys/vm/overcommit_memory, converted with atoi(), or -1 if the file
 * can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,sizeof(line),fp);
    fclose(fp);

    return got == NULL ? -1 : atoi(line);
}
9280
9281 void linuxOvercommitMemoryWarning(void) {
9282 if (linuxOvercommitMemoryValue() == 0) {
9283 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9284 }
9285 }
9286 #endif /* __linux__ */
9287
9288 static void daemonize(void) {
9289 int fd;
9290 FILE *fp;
9291
9292 if (fork() != 0) exit(0); /* parent exits */
9293 setsid(); /* create a new session */
9294
9295 /* Every output goes to /dev/null. If Redis is daemonized but
9296 * the 'logfile' is set to 'stdout' in the configuration file
9297 * it will not log at all. */
9298 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9299 dup2(fd, STDIN_FILENO);
9300 dup2(fd, STDOUT_FILENO);
9301 dup2(fd, STDERR_FILENO);
9302 if (fd > STDERR_FILENO) close(fd);
9303 }
9304 /* Try to write the pid file */
9305 fp = fopen(server.pidfile,"w");
9306 if (fp) {
9307 fprintf(fp,"%d\n",getpid());
9308 fclose(fp);
9309 }
9310 }
9311
9312 static void version() {
9313 printf("Redis server version %s\n", REDIS_VERSION);
9314 exit(0);
9315 }
9316
/* Print the command line help on standard error and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          "       ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
9322
9323 int main(int argc, char **argv) {
9324 time_t start;
9325
9326 initServerConfig();
9327 if (argc == 2) {
9328 if (strcmp(argv[1], "-v") == 0 ||
9329 strcmp(argv[1], "--version") == 0) version();
9330 if (strcmp(argv[1], "--help") == 0) usage();
9331 resetServerSaveParams();
9332 loadServerConfig(argv[1]);
9333 } else if ((argc > 2)) {
9334 usage();
9335 } else {
9336 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9337 }
9338 if (server.daemonize) daemonize();
9339 initServer();
9340 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9341 #ifdef __linux__
9342 linuxOvercommitMemoryWarning();
9343 #endif
9344 start = time(NULL);
9345 if (server.appendonly) {
9346 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9347 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9348 } else {
9349 if (rdbLoad(server.dbfilename) == REDIS_OK)
9350 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9351 }
9352 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9353 aeSetBeforeSleepProc(server.el,beforeSleep);
9354 aeMain(server.el);
9355 aeDeleteEventLoop(server.el);
9356 return 0;
9357 }
9358
9359 /* ============================= Backtrace support ========================= */
9360
9361 #ifdef HAVE_BACKTRACE
9362 static char *findFuncName(void *pointer, unsigned long *offset);
9363
9364 static void *getMcontextEip(ucontext_t *uc) {
9365 #if defined(__FreeBSD__)
9366 return (void*) uc->uc_mcontext.mc_eip;
9367 #elif defined(__dietlibc__)
9368 return (void*) uc->uc_mcontext.eip;
9369 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9370 #if __x86_64__
9371 return (void*) uc->uc_mcontext->__ss.__rip;
9372 #else
9373 return (void*) uc->uc_mcontext->__ss.__eip;
9374 #endif
9375 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9376 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9377 return (void*) uc->uc_mcontext->__ss.__rip;
9378 #else
9379 return (void*) uc->uc_mcontext->__ss.__eip;
9380 #endif
9381 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9382 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9383 #elif defined(__ia64__) /* Linux IA64 */
9384 return (void*) uc->uc_mcontext.sc_ip;
9385 #else
9386 return NULL;
9387 #endif
9388 }
9389
9390 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9391 void *trace[100];
9392 char **messages = NULL;
9393 int i, trace_size = 0;
9394 unsigned long offset=0;
9395 ucontext_t *uc = (ucontext_t*) secret;
9396 sds infostring;
9397 REDIS_NOTUSED(info);
9398
9399 redisLog(REDIS_WARNING,
9400 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9401 infostring = genRedisInfoString();
9402 redisLog(REDIS_WARNING, "%s",infostring);
9403 /* It's not safe to sdsfree() the returned string under memory
9404 * corruption conditions. Let it leak as we are going to abort */
9405
9406 trace_size = backtrace(trace, 100);
9407 /* overwrite sigaction with caller's address */
9408 if (getMcontextEip(uc) != NULL) {
9409 trace[1] = getMcontextEip(uc);
9410 }
9411 messages = backtrace_symbols(trace, trace_size);
9412
9413 for (i=1; i<trace_size; ++i) {
9414 char *fn = findFuncName(trace[i], &offset), *p;
9415
9416 p = strchr(messages[i],'+');
9417 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9418 redisLog(REDIS_WARNING,"%s", messages[i]);
9419 } else {
9420 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9421 }
9422 }
9423 /* free(messages); Don't call free() with possibly corrupted memory. */
9424 _exit(0);
9425 }
9426
9427 static void setupSigSegvAction(void) {
9428 struct sigaction act;
9429
9430 sigemptyset (&act.sa_mask);
9431 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9432 * is used. Otherwise, sa_handler is used */
9433 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9434 act.sa_sigaction = segvHandler;
9435 sigaction (SIGSEGV, &act, NULL);
9436 sigaction (SIGBUS, &act, NULL);
9437 sigaction (SIGFPE, &act, NULL);
9438 sigaction (SIGILL, &act, NULL);
9439 sigaction (SIGBUS, &act, NULL);
9440 return;
9441 }
9442
9443 #include "staticsymbols.h"
9444 /* This function try to convert a pointer into a function name. It's used in
9445 * oreder to provide a backtrace under segmentation fault that's able to
9446 * display functions declared as static (otherwise the backtrace is useless). */
9447 static char *findFuncName(void *pointer, unsigned long *offset){
9448 int i, ret = -1;
9449 unsigned long off, minoff = 0;
9450
9451 /* Try to match against the Symbol with the smallest offset */
9452 for (i=0; symsTable[i].pointer; i++) {
9453 unsigned long lp = (unsigned long) pointer;
9454
9455 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9456 off=lp-symsTable[i].pointer;
9457 if (ret < 0 || off < minoff) {
9458 minoff=off;
9459 ret=i;
9460 }
9461 }
9462 }
9463 if (ret == -1) return NULL;
9464 *offset = minoff;
9465 return symsTable[ret].name;
9466 }
9467 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
9470 #endif /* HAVE_BACKTRACE */
9471
9472
9473
9474 /* The End */
9475
9476
9477