]> git.saurik.com Git - redis.git/blob - redis.c
e4db385d601e63e6d4370b14bbe261168bd6e36e
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kind of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
129
130 /* Object types only used for dumping to disk */
131 #define REDIS_EXPIRETIME 253
132 #define REDIS_SELECTDB 254
133 #define REDIS_EOF 255
134
135 /* Defines related to the dump file format. To store 32 bits lengths for short
136 * keys requires a lot of space, so we check the most significant 2 bits of
137 * the first byte to interpret the length:
138 *
139 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
140 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
141 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
142 * 11|000000 this means: specially encoded object will follow. The six bits
143 * number specify the kind of object that follows.
144 * See the REDIS_RDB_ENC_* defines.
145 *
146 * Lengths up to 63 are stored using a single byte, most DB keys, and many
147 * values, will fit inside. */
148 #define REDIS_RDB_6BITLEN 0
149 #define REDIS_RDB_14BITLEN 1
150 #define REDIS_RDB_32BITLEN 2
151 #define REDIS_RDB_ENCVAL 3
152 #define REDIS_RDB_LENERR UINT_MAX
153
154 /* When a length of a string object stored on disk has the first two bits
155 * set, the remaining six bits specify a special encoding for the object
156 * accordingly to the following defines: */
157 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
158 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
159 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
160 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
161
162 /* Virtual memory object->where field. */
163 #define REDIS_VM_MEMORY 0 /* The object is in memory */
164 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
165 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
166 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
167
168 /* Virtual memory static configuration stuff.
169 * Check vmFindContiguousPages() to know more about these magic numbers. */
170 #define REDIS_VM_MAX_NEAR_PAGES 65536
171 #define REDIS_VM_MAX_RANDOM_JUMP 4096
172 #define REDIS_VM_MAX_THREADS 32
173 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
174 /* The following is the *percentage* of completed I/O jobs to process when the
175 * handler is called. While Virtual Memory I/O operations are performed by
176 * threads, these operations must be processed by the main thread when completed
177 * in order to take effect. */
178 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
179
180 /* Client flags */
181 #define REDIS_SLAVE 1 /* This client is a slave server */
182 #define REDIS_MASTER 2 /* This client is a master server */
183 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
184 #define REDIS_MULTI 8 /* This client is in a MULTI context */
185 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
186 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
187
188 /* Slave replication state - slave side */
189 #define REDIS_REPL_NONE 0 /* No active replication */
190 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
191 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
192
193 /* Slave replication state - from the point of view of master
194 * Note that in SEND_BULK and ONLINE state the slave receives new updates
195 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
196 * to start the next background saving in order to send updates to it. */
197 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
198 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
199 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
200 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
201
202 /* List related stuff */
203 #define REDIS_HEAD 0
204 #define REDIS_TAIL 1
205
206 /* Sort operations */
207 #define REDIS_SORT_GET 0
208 #define REDIS_SORT_ASC 1
209 #define REDIS_SORT_DESC 2
210 #define REDIS_SORTKEY_MAX 1024
211
212 /* Log levels */
213 #define REDIS_DEBUG 0
214 #define REDIS_VERBOSE 1
215 #define REDIS_NOTICE 2
216 #define REDIS_WARNING 3
217
218 /* Anti-warning macro... */
219 #define REDIS_NOTUSED(V) ((void) V)
220
221 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
222 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
223
224 /* Append only defines */
225 #define APPENDFSYNC_NO 0
226 #define APPENDFSYNC_ALWAYS 1
227 #define APPENDFSYNC_EVERYSEC 2
228
229 /* Hashes related defaults */
230 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
231 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
232
/* Project-specific assertion. When the checked expression is false the
 * stringified expression, the source file and the line number are handed to
 * _redisAssert() (the original note says this lets us print the stack trace),
 * after which the process is terminated with _exit(1). When the expression is
 * true the macro evaluates to a void no-op. */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    (!(_e) ? (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)) : (void)0)
236
237 /*================================= Data types ============================== */
238
239 /* A redis object, that is a type able to hold a string / list / set */
240
/* Virtual memory bookkeeping attached to a Redis object. */
struct redisObjectVM {
    off_t page;      /* the page at which the object is stored on disk */
    off_t usedpages; /* number of pages used on disk */
    time_t atime;    /* last access time */
};

/* The original declaration defined this file-scope object together with the
 * type; it is kept so the translation unit exposes exactly the same symbols.
 * NOTE(review): nothing in the visible part of the file reads or writes it -
 * confirm whether it is intentional. */
struct redisObjectVM vm;
247
248 /* The actual Redis Object */
249 typedef struct redisObject {
250 void *ptr;
251 unsigned char type;
252 unsigned char encoding;
253 unsigned char storage; /* If this object is a key, where is the value?
254 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
255 unsigned char vtype; /* If this object is a key, and value is swapped out,
256 * this is the type of the swapped out object. */
257 int refcount;
258 /* VM fields, this are only allocated if VM is active, otherwise the
259 * object allocation function will just allocate
260 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
261 * Redis without VM active will not have any overhead. */
262 struct redisObjectVM vm;
263 } robj;
264
/* Initialize a Redis string object allocated on the stack.
 *
 * _var is the robj lvalue to set up, _ptr the string payload it will point
 * to. The object gets refcount 1, type REDIS_STRING and the raw encoding;
 * the 'storage' field is set to REDIS_VM_MEMORY only when VM is enabled.
 *
 * The macro is kept near the structure definition to make sure it is updated
 * whenever the structure changes, to avoid bugs like bug #85, which was
 * introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller's own
 * ';' terminates the statement, so the macro can be used as the unbraced
 * body of an if/else. The arguments are parenthesized so that expressions
 * such as *objp or p+1 expand as intended. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
276
277 typedef struct redisDb {
278 dict *dict; /* The keyspace for this DB */
279 dict *expires; /* Timeout of keys with a timeout set */
280 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
281 dict *io_keys; /* Keys with clients waiting for VM I/O */
282 int id;
283 } redisDb;
284
285 /* Client MULTI/EXEC state */
286 typedef struct multiCmd {
287 robj **argv;
288 int argc;
289 struct redisCommand *cmd;
290 } multiCmd;
291
292 typedef struct multiState {
293 multiCmd *commands; /* Array of MULTI commands */
294 int count; /* Total number of MULTI commands */
295 } multiState;
296
297 /* With multiplexing we need to take per-clinet state.
298 * Clients are taken in a liked list. */
299 typedef struct redisClient {
300 int fd;
301 redisDb *db;
302 int dictid;
303 sds querybuf;
304 robj **argv, **mbargv;
305 int argc, mbargc;
306 int bulklen; /* bulk read len. -1 if not in bulk read mode */
307 int multibulk; /* multi bulk command format active */
308 list *reply;
309 int sentlen;
310 time_t lastinteraction; /* time of the last interaction, used for timeout */
311 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
312 int slaveseldb; /* slave selected db, if this client is a slave */
313 int authenticated; /* when requirepass is non-NULL */
314 int replstate; /* replication state if this is a slave */
315 int repldbfd; /* replication DB file descriptor */
316 long repldboff; /* replication DB file offset */
317 off_t repldbsize; /* replication DB file size */
318 multiState mstate; /* MULTI/EXEC state */
319 robj **blockingkeys; /* The key we are waiting to terminate a blocking
320 * operation such as BLPOP. Otherwise NULL. */
321 int blockingkeysnum; /* Number of blocking keys */
322 time_t blockingto; /* Blocking operation timeout. If UNIX current time
323 * is >= blockingto then the operation timed out. */
324 list *io_keys; /* Keys this client is waiting to be loaded from the
325 * swap file in order to continue. */
326 } redisClient;
327
/* A (seconds, changes) pair; the server keeps an array of these in
 * redisServer.saveparams.
 * NOTE(review): presumably one "save after N seconds if at least M changes"
 * rule - confirm against the code that consumes saveparams. */
struct saveparam {
    time_t seconds; /* time component of the pair */
    int changes;    /* change-count component of the pair */
};
332
333 /* Global server state structure */
334 struct redisServer {
335 int port;
336 int fd;
337 redisDb *db;
338 dict *sharingpool; /* Poll used for object sharing */
339 unsigned int sharingpoolsize;
340 long long dirty; /* changes to DB from the last save */
341 list *clients;
342 list *slaves, *monitors;
343 char neterr[ANET_ERR_LEN];
344 aeEventLoop *el;
345 int cronloops; /* number of times the cron function run */
346 list *objfreelist; /* A list of freed objects to avoid malloc() */
347 time_t lastsave; /* Unix time of last save succeeede */
348 /* Fields used only for stats */
349 time_t stat_starttime; /* server start time */
350 long long stat_numcommands; /* number of processed commands */
351 long long stat_numconnections; /* number of connections received */
352 /* Configuration */
353 int verbosity;
354 int glueoutputbuf;
355 int maxidletime;
356 int dbnum;
357 int daemonize;
358 int appendonly;
359 int appendfsync;
360 time_t lastfsync;
361 int appendfd;
362 int appendseldb;
363 char *pidfile;
364 pid_t bgsavechildpid;
365 pid_t bgrewritechildpid;
366 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
367 struct saveparam *saveparams;
368 int saveparamslen;
369 char *logfile;
370 char *bindaddr;
371 char *dbfilename;
372 char *appendfilename;
373 char *requirepass;
374 int shareobjects;
375 int rdbcompression;
376 /* Replication related */
377 int isslave;
378 char *masterauth;
379 char *masterhost;
380 int masterport;
381 redisClient *master; /* client that is master for this slave */
382 int replstate;
383 unsigned int maxclients;
384 unsigned long long maxmemory;
385 unsigned int blpop_blocked_clients;
386 unsigned int vm_blocked_clients;
387 /* Sort parameters - qsort_r() is only available under BSD so we
388 * have to take this state global, in order to pass it to sortCompare() */
389 int sort_desc;
390 int sort_alpha;
391 int sort_bypattern;
392 /* Virtual memory configuration */
393 int vm_enabled;
394 char *vm_swap_file;
395 off_t vm_page_size;
396 off_t vm_pages;
397 unsigned long long vm_max_memory;
398 /* Hashes config */
399 size_t hash_max_zipmap_entries;
400 size_t hash_max_zipmap_value;
401 /* Virtual memory state */
402 FILE *vm_fp;
403 int vm_fd;
404 off_t vm_next_page; /* Next probably empty page */
405 off_t vm_near_pages; /* Number of pages allocated sequentially */
406 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
407 time_t unixtime; /* Unix time sampled every second. */
408 /* Virtual memory I/O threads stuff */
409 /* An I/O thread process an element taken from the io_jobs queue and
410 * put the result of the operation in the io_done list. While the
411 * job is being processed, it's put on io_processing queue. */
412 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
413 list *io_processing; /* List of VM I/O jobs being processed */
414 list *io_processed; /* List of VM I/O jobs already processed */
415 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
416 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
417 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
418 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
419 pthread_attr_t io_threads_attr; /* attributes for threads creation */
420 int io_active_threads; /* Number of running I/O threads */
421 int vm_max_threads; /* Max number of I/O threads running at the same time */
422 /* Our main thread is blocked on the event loop, locking for sockets ready
423 * to be read or written, so when a threaded I/O operation is ready to be
424 * processed by the main thread, the I/O thread will use a unix pipe to
425 * awake the main thread. The followings are the two pipe FDs. */
426 int io_ready_pipe_read;
427 int io_ready_pipe_write;
428 /* Virtual memory stats */
429 unsigned long long vm_stats_used_pages;
430 unsigned long long vm_stats_swapped_objects;
431 unsigned long long vm_stats_swapouts;
432 unsigned long long vm_stats_swapins;
433 FILE *devnull;
434 };
435
436 typedef void redisCommandProc(redisClient *c);
437 struct redisCommand {
438 char *name;
439 redisCommandProc *proc;
440 int arity;
441 int flags;
442 /* What keys should be loaded in background when calling this command? */
443 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
444 int vm_lastkey; /* THe last argument that's a key */
445 int vm_keystep; /* The step between first and last key */
446 };
447
/* Pairs a name with an address stored as an unsigned long.
 * NOTE(review): presumably a function-name/address entry used when
 * symbolizing stack traces - confirm against the code that uses it. */
struct redisFunctionSym {
    char *name;            /* the name half of the pair */
    unsigned long pointer; /* the address half, stored as an integer */
};
452
453 typedef struct _redisSortObject {
454 robj *obj;
455 union {
456 double score;
457 robj *cmpobj;
458 } u;
459 } redisSortObject;
460
461 typedef struct _redisSortOperation {
462 int type;
463 robj *pattern;
464 } redisSortOperation;
465
466 /* ZSETs use a specialized version of Skiplists */
467
468 typedef struct zskiplistNode {
469 struct zskiplistNode **forward;
470 struct zskiplistNode *backward;
471 unsigned int *span;
472 double score;
473 robj *obj;
474 } zskiplistNode;
475
/* The skiplist itself: header and tail node pointers, a length counter and
 * a level value. */
struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
};
typedef struct zskiplist zskiplist;
481
482 typedef struct zset {
483 dict *dict;
484 zskiplist *zsl;
485 } zset;
486
487 /* Our shared "common" objects */
488
489 struct sharedObjectsStruct {
490 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
491 *colon, *nullbulk, *nullmultibulk, *queued,
492 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
493 *outofrangeerr, *plus,
494 *select0, *select1, *select2, *select3, *select4,
495 *select5, *select6, *select7, *select8, *select9;
496 } shared;
497
/* Globals that are actually used as constants. These doubles are used for
 * the on-disk serialization of double values and are initialized at runtime
 * (not with constant initializers) to avoid strange compiler
 * optimizations. */
static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
503
504 /* VM threaded I/O request message */
505 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
506 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
507 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
508 typedef struct iojob {
509 int type; /* Request type, REDIS_IOJOB_* */
510 redisDb *db;/* Redis database */
511 robj *key; /* This I/O request is about swapping this key */
512 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
513 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
514 off_t page; /* Swap page where to read/write the object */
515 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
516 int canceled; /* True if this command was canceled by blocking side of VM */
517 pthread_t thread; /* ID of the thread processing this entry */
518 } iojob;
519
520 /*================================ Prototypes =============================== */
521
/* Forward declarations of file-local helpers. Every function here is
 * 'static', so it is only visible inside this translation unit and its
 * definition must appear in this same file. */
522 static void freeStringObject(robj *o);
523 static void freeListObject(robj *o);
524 static void freeSetObject(robj *o);
525 static void decrRefCount(void *o);
526 static robj *createObject(int type, void *ptr);
527 static void freeClient(redisClient *c);
528 static int rdbLoad(char *filename);
529 static void addReply(redisClient *c, robj *obj);
530 static void addReplySds(redisClient *c, sds s);
531 static void incrRefCount(robj *o);
532 static int rdbSaveBackground(char *filename);
533 static robj *createStringObject(char *ptr, size_t len);
534 static robj *dupStringObject(robj *o);
535 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
536 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
537 static int syncWithMaster(void);
538 static robj *tryObjectSharing(robj *o);
539 static int tryObjectEncoding(robj *o);
540 static robj *getDecodedObject(robj *o);
541 static int removeExpire(redisDb *db, robj *key);
542 static int expireIfNeeded(redisDb *db, robj *key);
543 static int deleteIfVolatile(redisDb *db, robj *key);
544 static int deleteIfSwapped(redisDb *db, robj *key);
545 static int deleteKey(redisDb *db, robj *key);
546 static time_t getExpire(redisDb *db, robj *key);
547 static int setExpire(redisDb *db, robj *key, time_t when);
548 static void updateSlavesWaitingBgsave(int bgsaveerr);
549 static void freeMemoryIfNeeded(void);
550 static int processCommand(redisClient *c);
551 static void setupSigSegvAction(void);
552 static void rdbRemoveTempFile(pid_t childpid);
553 static void aofRemoveTempFile(pid_t childpid);
554 static size_t stringObjectLen(robj *o);
555 static void processInputBuffer(redisClient *c);
556 static zskiplist *zslCreate(void);
557 static void zslFree(zskiplist *zsl);
558 static void zslInsert(zskiplist *zsl, double score, robj *obj);
559 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
560 static void initClientMultiState(redisClient *c);
561 static void freeClientMultiState(redisClient *c);
562 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
563 static void unblockClientWaitingData(redisClient *c);
564 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
565 static void vmInit(void);
566 static void vmMarkPagesFree(off_t page, off_t count);
567 static robj *vmLoadObject(robj *key);
568 static robj *vmPreviewObject(robj *key);
569 static int vmSwapOneObjectBlocking(void);
570 static int vmSwapOneObjectThreaded(void);
571 static int vmCanSwapOut(void);
572 static int tryFreeOneObjectFromFreelist(void);
573 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
574 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
575 static void vmCancelThreadedIOJob(robj *o);
576 static void lockThreadedIO(void);
577 static void unlockThreadedIO(void);
578 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
579 static void freeIOJob(iojob *j);
580 static void queueIOJob(iojob *j);
581 static int vmWriteObjectOnSwap(robj *o, off_t page);
582 static robj *vmReadObjectFromSwap(off_t page, int type);
583 static void waitEmptyIOJobsQueue(void);
584 static void vmReopenSwapFile(void);
585 static int vmFreePage(off_t page);
586 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
587 static int dontWaitForSwappedKey(redisClient *c, robj *key);
588 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
589 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
590 static struct redisCommand *lookupCommand(char *name);
591 static void call(redisClient *c, struct redisCommand *cmd);
592 static void resetClient(redisClient *c);
593
/* Forward declarations of the command implementations. Each one takes the
 * calling client and returns nothing, i.e. it has the redisCommandProc
 * signature, which is what lets the command table store pointers to them.
 * NOTE(review): rpoplpushcommand is the only handler not following the
 * xxxCommand capitalization; renaming it would also require updating the
 * command table entry and the definition. */
594 static void authCommand(redisClient *c);
595 static void pingCommand(redisClient *c);
596 static void echoCommand(redisClient *c);
597 static void setCommand(redisClient *c);
598 static void setnxCommand(redisClient *c);
599 static void getCommand(redisClient *c);
600 static void delCommand(redisClient *c);
601 static void existsCommand(redisClient *c);
602 static void incrCommand(redisClient *c);
603 static void decrCommand(redisClient *c);
604 static void incrbyCommand(redisClient *c);
605 static void decrbyCommand(redisClient *c);
606 static void selectCommand(redisClient *c);
607 static void randomkeyCommand(redisClient *c);
608 static void keysCommand(redisClient *c);
609 static void dbsizeCommand(redisClient *c);
610 static void lastsaveCommand(redisClient *c);
611 static void saveCommand(redisClient *c);
612 static void bgsaveCommand(redisClient *c);
613 static void bgrewriteaofCommand(redisClient *c);
614 static void shutdownCommand(redisClient *c);
615 static void moveCommand(redisClient *c);
616 static void renameCommand(redisClient *c);
617 static void renamenxCommand(redisClient *c);
618 static void lpushCommand(redisClient *c);
619 static void rpushCommand(redisClient *c);
620 static void lpopCommand(redisClient *c);
621 static void rpopCommand(redisClient *c);
622 static void llenCommand(redisClient *c);
623 static void lindexCommand(redisClient *c);
624 static void lrangeCommand(redisClient *c);
625 static void ltrimCommand(redisClient *c);
626 static void typeCommand(redisClient *c);
627 static void lsetCommand(redisClient *c);
628 static void saddCommand(redisClient *c);
629 static void sremCommand(redisClient *c);
630 static void smoveCommand(redisClient *c);
631 static void sismemberCommand(redisClient *c);
632 static void scardCommand(redisClient *c);
633 static void spopCommand(redisClient *c);
634 static void srandmemberCommand(redisClient *c);
635 static void sinterCommand(redisClient *c);
636 static void sinterstoreCommand(redisClient *c);
637 static void sunionCommand(redisClient *c);
638 static void sunionstoreCommand(redisClient *c);
639 static void sdiffCommand(redisClient *c);
640 static void sdiffstoreCommand(redisClient *c);
641 static void syncCommand(redisClient *c);
642 static void flushdbCommand(redisClient *c);
643 static void flushallCommand(redisClient *c);
644 static void sortCommand(redisClient *c);
645 static void lremCommand(redisClient *c);
646 static void rpoplpushcommand(redisClient *c);
647 static void infoCommand(redisClient *c);
648 static void mgetCommand(redisClient *c);
649 static void monitorCommand(redisClient *c);
650 static void expireCommand(redisClient *c);
651 static void expireatCommand(redisClient *c);
652 static void getsetCommand(redisClient *c);
653 static void ttlCommand(redisClient *c);
654 static void slaveofCommand(redisClient *c);
655 static void debugCommand(redisClient *c);
656 static void msetCommand(redisClient *c);
657 static void msetnxCommand(redisClient *c);
658 static void zaddCommand(redisClient *c);
659 static void zincrbyCommand(redisClient *c);
660 static void zrangeCommand(redisClient *c);
661 static void zrangebyscoreCommand(redisClient *c);
662 static void zcountCommand(redisClient *c);
663 static void zrevrangeCommand(redisClient *c);
664 static void zcardCommand(redisClient *c);
665 static void zremCommand(redisClient *c);
666 static void zscoreCommand(redisClient *c);
667 static void zremrangebyscoreCommand(redisClient *c);
668 static void multiCommand(redisClient *c);
669 static void execCommand(redisClient *c);
670 static void discardCommand(redisClient *c);
671 static void blpopCommand(redisClient *c);
672 static void brpopCommand(redisClient *c);
673 static void appendCommand(redisClient *c);
674 static void substrCommand(redisClient *c);
675 static void zrankCommand(redisClient *c);
676 static void hsetCommand(redisClient *c);
677 static void hgetCommand(redisClient *c);
678 static void zmergeCommand(redisClient *c);
679 static void zmergeweighedCommand(redisClient *c);
680
681 /*================================= Globals ================================= */
682
683 /* Global vars. 'server' is the one instance of the server state; being
 * 'static' at file scope it is zero-initialized before main() runs and is
 * only visible inside this translation unit. */
684 static struct redisServer server; /* server global state */
685 static struct redisCommand cmdTable[] = {
686 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
687 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
688 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
689 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
690 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
691 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
692 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
693 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
694 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
695 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
696 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
697 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
698 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
699 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
700 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
701 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
702 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
703 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
704 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
705 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
706 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
707 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
708 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
709 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
710 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
711 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
712 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
713 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
715 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
716 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
717 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
718 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
719 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
720 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
721 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
722 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
724 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
725 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
726 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
727 {"zmerge",zmergeCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
728 {"zmergeweighed",zmergeweighedCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-2,2},
729 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
730 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
731 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
732 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
733 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
734 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
735 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
736 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
737 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
738 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
739 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
740 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
741 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
742 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
743 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
744 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
745 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
746 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
747 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
748 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
749 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
750 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
751 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
752 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
753 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
754 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
755 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
756 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
757 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
758 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
759 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
760 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
761 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
762 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
763 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
764 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
765 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
766 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
767 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
768 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
769 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
770 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
771 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
772 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
773 {NULL,NULL,0,0,0,0,0}
774 };
775
776 /*============================ Utility functions ============================ */
777
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer has to be NUL terminated:
 * only the bytes covered by the two lengths are ever read.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; "a-z" ranges and a leading '^'
 *           (negation) are supported, "\x" inside the set matches x
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero the comparison is case insensitive (escaped
 * bytes inside a [...] set are still compared verbatim).
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        /* Every construct but '*' consumes one byte of the string: with
         * nothing left to consume the match fails, and string[0] must not
         * be read at all. */
        if (stringLen <= 0 && pattern[0] != '*')
            return 0; /* no match */

        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every suffix. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen <= 0) {
                    /* Unterminated set: step back so that the common
                     * "advance the pattern" code below lands exactly on
                     * the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may remain. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
900
901 static void redisLog(int level, const char *fmt, ...) {
902 va_list ap;
903 FILE *fp;
904
905 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
906 if (!fp) return;
907
908 va_start(ap, fmt);
909 if (level >= server.verbosity) {
910 char *c = ".-*#";
911 char buf[64];
912 time_t now;
913
914 now = time(NULL);
915 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
916 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
917 vfprintf(fp, fmt, ap);
918 fprintf(fp,"\n");
919 fflush(fp);
920 }
921 va_end(ap);
922
923 if (server.logfile) fclose(fp);
924 }
925
926 /*====================== Hash table type implementation ==================== */
927
928 /* This is a hash table type that uses the SDS dynamic strings library as
929 * keys and Redis objects as values (objects can hold SDS strings,
930 * lists, sets). */
931
/* Dict value destructor that simply releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
937
938 static void dictListDestructor(void *privdata, void *val)
939 {
940 DICT_NOTUSED(privdata);
941 listRelease((list*)val);
942 }
943
944 static int sdsDictKeyCompare(void *privdata, const void *key1,
945 const void *key2)
946 {
947 int l1,l2;
948 DICT_NOTUSED(privdata);
949
950 l1 = sdslen((sds)key1);
951 l2 = sdslen((sds)key2);
952 if (l1 != l2) return 0;
953 return memcmp(key1, key2, l1) == 0;
954 }
955
/* Dict destructor for redis objects: drops one reference from 'val'.
 * 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
963
964 static int dictObjKeyCompare(void *privdata, const void *key1,
965 const void *key2)
966 {
967 const robj *o1 = key1, *o2 = key2;
968 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
969 }
970
971 static unsigned int dictObjHash(const void *key) {
972 const robj *o = key;
973 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
974 }
975
976 static int dictEncObjKeyCompare(void *privdata, const void *key1,
977 const void *key2)
978 {
979 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
980 int cmp;
981
982 o1 = getDecodedObject(o1);
983 o2 = getDecodedObject(o2);
984 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
985 decrRefCount(o1);
986 decrRefCount(o2);
987 return cmp;
988 }
989
990 static unsigned int dictEncObjHash(const void *key) {
991 robj *o = (robj*) key;
992
993 if (o->encoding == REDIS_ENCODING_RAW) {
994 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
995 } else {
996 if (o->encoding == REDIS_ENCODING_INT) {
997 char buf[32];
998 int len;
999
1000 len = snprintf(buf,32,"%ld",(long)o->ptr);
1001 return dictGenHashFunction((unsigned char*)buf, len);
1002 } else {
1003 unsigned int hash;
1004
1005 o = getDecodedObject(o);
1006 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1007 decrRefCount(o);
1008 return hash;
1009 }
1010 }
1011 }
1012
1013 /* Sets type and expires */
1014 static dictType setDictType = {
1015 dictEncObjHash, /* hash function */
1016 NULL, /* key dup */
1017 NULL, /* val dup */
1018 dictEncObjKeyCompare, /* key compare */
1019 dictRedisObjectDestructor, /* key destructor */
1020 NULL /* val destructor */
1021 };
1022
1023 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1024 static dictType zsetDictType = {
1025 dictEncObjHash, /* hash function */
1026 NULL, /* key dup */
1027 NULL, /* val dup */
1028 dictEncObjKeyCompare, /* key compare */
1029 dictRedisObjectDestructor, /* key destructor */
1030 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1031 };
1032
1033 /* Db->dict */
1034 static dictType dbDictType = {
1035 dictObjHash, /* hash function */
1036 NULL, /* key dup */
1037 NULL, /* val dup */
1038 dictObjKeyCompare, /* key compare */
1039 dictRedisObjectDestructor, /* key destructor */
1040 dictRedisObjectDestructor /* val destructor */
1041 };
1042
1043 /* Db->expires */
1044 static dictType keyptrDictType = {
1045 dictObjHash, /* hash function */
1046 NULL, /* key dup */
1047 NULL, /* val dup */
1048 dictObjKeyCompare, /* key compare */
1049 dictRedisObjectDestructor, /* key destructor */
1050 NULL /* val destructor */
1051 };
1052
1053 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1054 static dictType hashDictType = {
1055 dictEncObjHash, /* hash function */
1056 NULL, /* key dup */
1057 NULL, /* val dup */
1058 dictEncObjKeyCompare, /* key compare */
1059 dictRedisObjectDestructor, /* key destructor */
1060 dictRedisObjectDestructor /* val destructor */
1061 };
1062
1063 /* Keylist hash table type has unencoded redis objects as keys and
1064 * lists as values. It's used for blocking operations (BLPOP) and to
1065 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1066 static dictType keylistDictType = {
1067 dictObjHash, /* hash function */
1068 NULL, /* key dup */
1069 NULL, /* val dup */
1070 dictObjKeyCompare, /* key compare */
1071 dictRedisObjectDestructor, /* key destructor */
1072 dictListDestructor /* val destructor */
1073 };
1074
1075 /* ========================= Random utility functions ======================= */
1076
1077 /* Redis generally does not try to recover from out of memory conditions
1078 * when allocating objects or strings, it is not clear if it will be possible
1079 * to report this condition to the client since the networking layer itself
1080 * is based on heap allocation for send buffers, so we simply abort.
1081 * At least the code will be simpler to read... */
1082 static void oom(const char *msg) {
1083 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1084 sleep(1);
1085 abort();
1086 }
1087
1088 /* ====================== Redis server networking stuff ===================== */
1089 static void closeTimedoutClients(void) {
1090 redisClient *c;
1091 listNode *ln;
1092 time_t now = time(NULL);
1093 listIter li;
1094
1095 listRewind(server.clients,&li);
1096 while ((ln = listNext(&li)) != NULL) {
1097 c = listNodeValue(ln);
1098 if (server.maxidletime &&
1099 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1100 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1101 (now - c->lastinteraction > server.maxidletime))
1102 {
1103 redisLog(REDIS_VERBOSE,"Closing idle client");
1104 freeClient(c);
1105 } else if (c->flags & REDIS_BLOCKED) {
1106 if (c->blockingto != 0 && c->blockingto < now) {
1107 addReply(c,shared.nullmultibulk);
1108 unblockClientWaitingData(c);
1109 }
1110 }
1111 }
1112 }
1113
1114 static int htNeedsResize(dict *dict) {
1115 long long size, used;
1116
1117 size = dictSlots(dict);
1118 used = dictSize(dict);
1119 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1120 (used*100/size < REDIS_HT_MINFILL));
1121 }
1122
1123 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1124 * we resize the hash table to save memory */
1125 static void tryResizeHashTables(void) {
1126 int j;
1127
1128 for (j = 0; j < server.dbnum; j++) {
1129 if (htNeedsResize(server.db[j].dict)) {
1130 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1131 dictResize(server.db[j].dict);
1132 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1133 }
1134 if (htNeedsResize(server.db[j].expires))
1135 dictResize(server.db[j].expires);
1136 }
1137 }
1138
1139 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1140 void backgroundSaveDoneHandler(int statloc) {
1141 int exitcode = WEXITSTATUS(statloc);
1142 int bysignal = WIFSIGNALED(statloc);
1143
1144 if (!bysignal && exitcode == 0) {
1145 redisLog(REDIS_NOTICE,
1146 "Background saving terminated with success");
1147 server.dirty = 0;
1148 server.lastsave = time(NULL);
1149 } else if (!bysignal && exitcode != 0) {
1150 redisLog(REDIS_WARNING, "Background saving error");
1151 } else {
1152 redisLog(REDIS_WARNING,
1153 "Background saving terminated by signal");
1154 rdbRemoveTempFile(server.bgsavechildpid);
1155 }
1156 server.bgsavechildpid = -1;
1157 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1158 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1159 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1160 }
1161
1162 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1163 * Handle this. */
1164 void backgroundRewriteDoneHandler(int statloc) {
1165 int exitcode = WEXITSTATUS(statloc);
1166 int bysignal = WIFSIGNALED(statloc);
1167
1168 if (!bysignal && exitcode == 0) {
1169 int fd;
1170 char tmpfile[256];
1171
1172 redisLog(REDIS_NOTICE,
1173 "Background append only file rewriting terminated with success");
1174 /* Now it's time to flush the differences accumulated by the parent */
1175 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1176 fd = open(tmpfile,O_WRONLY|O_APPEND);
1177 if (fd == -1) {
1178 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1179 goto cleanup;
1180 }
1181 /* Flush our data... */
1182 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1183 (signed) sdslen(server.bgrewritebuf)) {
1184 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1185 close(fd);
1186 goto cleanup;
1187 }
1188 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1189 /* Now our work is to rename the temp file into the stable file. And
1190 * switch the file descriptor used by the server for append only. */
1191 if (rename(tmpfile,server.appendfilename) == -1) {
1192 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1193 close(fd);
1194 goto cleanup;
1195 }
1196 /* Mission completed... almost */
1197 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1198 if (server.appendfd != -1) {
1199 /* If append only is actually enabled... */
1200 close(server.appendfd);
1201 server.appendfd = fd;
1202 fsync(fd);
1203 server.appendseldb = -1; /* Make sure it will issue SELECT */
1204 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1205 } else {
1206 /* If append only is disabled we just generate a dump in this
1207 * format. Why not? */
1208 close(fd);
1209 }
1210 } else if (!bysignal && exitcode != 0) {
1211 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1212 } else {
1213 redisLog(REDIS_WARNING,
1214 "Background append only file rewriting terminated by signal");
1215 }
1216 cleanup:
1217 sdsfree(server.bgrewritebuf);
1218 server.bgrewritebuf = sdsempty();
1219 aofRemoveTempFile(server.bgrewritechildpid);
1220 server.bgrewritechildpid = -1;
1221 }
1222
1223 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1224 int j, loops = server.cronloops++;
1225 REDIS_NOTUSED(eventLoop);
1226 REDIS_NOTUSED(id);
1227 REDIS_NOTUSED(clientData);
1228
1229 /* We take a cached value of the unix time in the global state because
1230 * with virtual memory and aging there is to store the current time
1231 * in objects at every object access, and accuracy is not needed.
1232 * To access a global var is faster than calling time(NULL) */
1233 server.unixtime = time(NULL);
1234
1235 /* Show some info about non-empty databases */
1236 for (j = 0; j < server.dbnum; j++) {
1237 long long size, used, vkeys;
1238
1239 size = dictSlots(server.db[j].dict);
1240 used = dictSize(server.db[j].dict);
1241 vkeys = dictSize(server.db[j].expires);
1242 if (!(loops % 5) && (used || vkeys)) {
1243 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1244 /* dictPrintStats(server.dict); */
1245 }
1246 }
1247
1248 /* We don't want to resize the hash tables while a bacground saving
1249 * is in progress: the saving child is created using fork() that is
1250 * implemented with a copy-on-write semantic in most modern systems, so
1251 * if we resize the HT while there is the saving child at work actually
1252 * a lot of memory movements in the parent will cause a lot of pages
1253 * copied. */
1254 if (server.bgsavechildpid == -1) tryResizeHashTables();
1255
1256 /* Show information about connected clients */
1257 if (!(loops % 5)) {
1258 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1259 listLength(server.clients)-listLength(server.slaves),
1260 listLength(server.slaves),
1261 zmalloc_used_memory(),
1262 dictSize(server.sharingpool));
1263 }
1264
1265 /* Close connections of timedout clients */
1266 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1267 closeTimedoutClients();
1268
1269 /* Check if a background saving or AOF rewrite in progress terminated */
1270 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1271 int statloc;
1272 pid_t pid;
1273
1274 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1275 if (pid == server.bgsavechildpid) {
1276 backgroundSaveDoneHandler(statloc);
1277 } else {
1278 backgroundRewriteDoneHandler(statloc);
1279 }
1280 }
1281 } else {
1282 /* If there is not a background saving in progress check if
1283 * we have to save now */
1284 time_t now = time(NULL);
1285 for (j = 0; j < server.saveparamslen; j++) {
1286 struct saveparam *sp = server.saveparams+j;
1287
1288 if (server.dirty >= sp->changes &&
1289 now-server.lastsave > sp->seconds) {
1290 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1291 sp->changes, sp->seconds);
1292 rdbSaveBackground(server.dbfilename);
1293 break;
1294 }
1295 }
1296 }
1297
1298 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1299 * will use few CPU cycles if there are few expiring keys, otherwise
1300 * it will get more aggressive to avoid that too much memory is used by
1301 * keys that can be removed from the keyspace. */
1302 for (j = 0; j < server.dbnum; j++) {
1303 int expired;
1304 redisDb *db = server.db+j;
1305
1306 /* Continue to expire if at the end of the cycle more than 25%
1307 * of the keys were expired. */
1308 do {
1309 long num = dictSize(db->expires);
1310 time_t now = time(NULL);
1311
1312 expired = 0;
1313 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1314 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1315 while (num--) {
1316 dictEntry *de;
1317 time_t t;
1318
1319 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1320 t = (time_t) dictGetEntryVal(de);
1321 if (now > t) {
1322 deleteKey(db,dictGetEntryKey(de));
1323 expired++;
1324 }
1325 }
1326 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1327 }
1328
1329 /* Swap a few keys on disk if we are over the memory limit and VM
1330 * is enbled. Try to free objects from the free list first. */
1331 if (vmCanSwapOut()) {
1332 while (server.vm_enabled && zmalloc_used_memory() >
1333 server.vm_max_memory)
1334 {
1335 int retval;
1336
1337 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1338 retval = (server.vm_max_threads == 0) ?
1339 vmSwapOneObjectBlocking() :
1340 vmSwapOneObjectThreaded();
1341 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1342 zmalloc_used_memory() >
1343 (server.vm_max_memory+server.vm_max_memory/10))
1344 {
1345 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1346 }
1347 /* Note that when using threade I/O we free just one object,
1348 * because anyway when the I/O thread in charge to swap this
1349 * object out will finish, the handler of completed jobs
1350 * will try to swap more objects if we are still out of memory. */
1351 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1352 }
1353 }
1354
1355 /* Check if we should connect to a MASTER */
1356 if (server.replstate == REDIS_REPL_CONNECT) {
1357 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1358 if (syncWithMaster() == REDIS_OK) {
1359 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1360 }
1361 }
1362 return 1000;
1363 }
1364
1365 /* This function gets called every time Redis is entering the
1366 * main loop of the event driven library, that is, before to sleep
1367 * for ready file descriptors. */
1368 static void beforeSleep(struct aeEventLoop *eventLoop) {
1369 REDIS_NOTUSED(eventLoop);
1370
1371 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1372 listIter li;
1373 listNode *ln;
1374
1375 listRewind(server.io_ready_clients,&li);
1376 while((ln = listNext(&li))) {
1377 redisClient *c = ln->value;
1378 struct redisCommand *cmd;
1379
1380 /* Resume the client. */
1381 listDelNode(server.io_ready_clients,ln);
1382 c->flags &= (~REDIS_IO_WAIT);
1383 server.vm_blocked_clients--;
1384 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1385 readQueryFromClient, c);
1386 cmd = lookupCommand(c->argv[0]->ptr);
1387 assert(cmd != NULL);
1388 call(c,cmd);
1389 resetClient(c);
1390 /* There may be more data to process in the input buffer. */
1391 if (c->querybuf && sdslen(c->querybuf) > 0)
1392 processInputBuffer(c);
1393 }
1394 }
1395 }
1396
1397 static void createSharedObjects(void) {
1398 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1399 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1400 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1401 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1402 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1403 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1404 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1405 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1406 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1407 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1408 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1409 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1410 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1411 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1412 "-ERR no such key\r\n"));
1413 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1414 "-ERR syntax error\r\n"));
1415 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1416 "-ERR source and destination objects are the same\r\n"));
1417 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1418 "-ERR index out of range\r\n"));
1419 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1420 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1421 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1422 shared.select0 = createStringObject("select 0\r\n",10);
1423 shared.select1 = createStringObject("select 1\r\n",10);
1424 shared.select2 = createStringObject("select 2\r\n",10);
1425 shared.select3 = createStringObject("select 3\r\n",10);
1426 shared.select4 = createStringObject("select 4\r\n",10);
1427 shared.select5 = createStringObject("select 5\r\n",10);
1428 shared.select6 = createStringObject("select 6\r\n",10);
1429 shared.select7 = createStringObject("select 7\r\n",10);
1430 shared.select8 = createStringObject("select 8\r\n",10);
1431 shared.select9 = createStringObject("select 9\r\n",10);
1432 }
1433
1434 static void appendServerSaveParams(time_t seconds, int changes) {
1435 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1436 server.saveparams[server.saveparamslen].seconds = seconds;
1437 server.saveparams[server.saveparamslen].changes = changes;
1438 server.saveparamslen++;
1439 }
1440
1441 static void resetServerSaveParams() {
1442 zfree(server.saveparams);
1443 server.saveparams = NULL;
1444 server.saveparamslen = 0;
1445 }
1446
1447 static void initServerConfig() {
1448 server.dbnum = REDIS_DEFAULT_DBNUM;
1449 server.port = REDIS_SERVERPORT;
1450 server.verbosity = REDIS_VERBOSE;
1451 server.maxidletime = REDIS_MAXIDLETIME;
1452 server.saveparams = NULL;
1453 server.logfile = NULL; /* NULL = log on standard output */
1454 server.bindaddr = NULL;
1455 server.glueoutputbuf = 1;
1456 server.daemonize = 0;
1457 server.appendonly = 0;
1458 server.appendfsync = APPENDFSYNC_ALWAYS;
1459 server.lastfsync = time(NULL);
1460 server.appendfd = -1;
1461 server.appendseldb = -1; /* Make sure the first time will not match */
1462 server.pidfile = "/var/run/redis.pid";
1463 server.dbfilename = "dump.rdb";
1464 server.appendfilename = "appendonly.aof";
1465 server.requirepass = NULL;
1466 server.shareobjects = 0;
1467 server.rdbcompression = 1;
1468 server.sharingpoolsize = 1024;
1469 server.maxclients = 0;
1470 server.blpop_blocked_clients = 0;
1471 server.maxmemory = 0;
1472 server.vm_enabled = 0;
1473 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1474 server.vm_page_size = 256; /* 256 bytes per page */
1475 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1476 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1477 server.vm_max_threads = 4;
1478 server.vm_blocked_clients = 0;
1479 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1480 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1481
1482 resetServerSaveParams();
1483
1484 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1485 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1486 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1487 /* Replication related */
1488 server.isslave = 0;
1489 server.masterauth = NULL;
1490 server.masterhost = NULL;
1491 server.masterport = 6379;
1492 server.master = NULL;
1493 server.replstate = REDIS_REPL_NONE;
1494
1495 /* Double constants initialization */
1496 R_Zero = 0.0;
1497 R_PosInf = 1.0/R_Zero;
1498 R_NegInf = -1.0/R_Zero;
1499 R_Nan = R_Zero/R_Zero;
1500 }
1501
1502 static void initServer() {
1503 int j;
1504
1505 signal(SIGHUP, SIG_IGN);
1506 signal(SIGPIPE, SIG_IGN);
1507 setupSigSegvAction();
1508
1509 server.devnull = fopen("/dev/null","w");
1510 if (server.devnull == NULL) {
1511 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1512 exit(1);
1513 }
1514 server.clients = listCreate();
1515 server.slaves = listCreate();
1516 server.monitors = listCreate();
1517 server.objfreelist = listCreate();
1518 createSharedObjects();
1519 server.el = aeCreateEventLoop();
1520 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1521 server.sharingpool = dictCreate(&setDictType,NULL);
1522 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1523 if (server.fd == -1) {
1524 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1525 exit(1);
1526 }
1527 for (j = 0; j < server.dbnum; j++) {
1528 server.db[j].dict = dictCreate(&dbDictType,NULL);
1529 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1530 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1531 if (server.vm_enabled)
1532 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1533 server.db[j].id = j;
1534 }
1535 server.cronloops = 0;
1536 server.bgsavechildpid = -1;
1537 server.bgrewritechildpid = -1;
1538 server.bgrewritebuf = sdsempty();
1539 server.lastsave = time(NULL);
1540 server.dirty = 0;
1541 server.stat_numcommands = 0;
1542 server.stat_numconnections = 0;
1543 server.stat_starttime = time(NULL);
1544 server.unixtime = time(NULL);
1545 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1546 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1547 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1548
1549 if (server.appendonly) {
1550 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1551 if (server.appendfd == -1) {
1552 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1553 strerror(errno));
1554 exit(1);
1555 }
1556 }
1557
1558 if (server.vm_enabled) vmInit();
1559 }
1560
1561 /* Empty the whole database */
1562 static long long emptyDb() {
1563 int j;
1564 long long removed = 0;
1565
1566 for (j = 0; j < server.dbnum; j++) {
1567 removed += dictSize(server.db[j].dict);
1568 dictEmpty(server.db[j].dict);
1569 dictEmpty(server.db[j].expires);
1570 }
1571 return removed;
1572 }
1573
/* Convert a "yes" / "no" string (compared case insensitively) into a
 * boolean. Returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1579
1580 /* I agree, this is a very rudimental way to load a configuration...
1581 will improve later if the config gets more complex */
1582 static void loadServerConfig(char *filename) {
1583 FILE *fp;
1584 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1585 int linenum = 0;
1586 sds line = NULL;
1587
1588 if (filename[0] == '-' && filename[1] == '\0')
1589 fp = stdin;
1590 else {
1591 if ((fp = fopen(filename,"r")) == NULL) {
1592 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1593 exit(1);
1594 }
1595 }
1596
1597 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1598 sds *argv;
1599 int argc, j;
1600
1601 linenum++;
1602 line = sdsnew(buf);
1603 line = sdstrim(line," \t\r\n");
1604
1605 /* Skip comments and blank lines*/
1606 if (line[0] == '#' || line[0] == '\0') {
1607 sdsfree(line);
1608 continue;
1609 }
1610
1611 /* Split into arguments */
1612 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1613 sdstolower(argv[0]);
1614
1615 /* Execute config directives */
1616 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1617 server.maxidletime = atoi(argv[1]);
1618 if (server.maxidletime < 0) {
1619 err = "Invalid timeout value"; goto loaderr;
1620 }
1621 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1622 server.port = atoi(argv[1]);
1623 if (server.port < 1 || server.port > 65535) {
1624 err = "Invalid port"; goto loaderr;
1625 }
1626 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1627 server.bindaddr = zstrdup(argv[1]);
1628 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1629 int seconds = atoi(argv[1]);
1630 int changes = atoi(argv[2]);
1631 if (seconds < 1 || changes < 0) {
1632 err = "Invalid save parameters"; goto loaderr;
1633 }
1634 appendServerSaveParams(seconds,changes);
1635 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1636 if (chdir(argv[1]) == -1) {
1637 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1638 argv[1], strerror(errno));
1639 exit(1);
1640 }
1641 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1642 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1643 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1644 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1645 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1646 else {
1647 err = "Invalid log level. Must be one of debug, notice, warning";
1648 goto loaderr;
1649 }
1650 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1651 FILE *logfp;
1652
1653 server.logfile = zstrdup(argv[1]);
1654 if (!strcasecmp(server.logfile,"stdout")) {
1655 zfree(server.logfile);
1656 server.logfile = NULL;
1657 }
1658 if (server.logfile) {
1659 /* Test if we are able to open the file. The server will not
1660 * be able to abort just for this problem later... */
1661 logfp = fopen(server.logfile,"a");
1662 if (logfp == NULL) {
1663 err = sdscatprintf(sdsempty(),
1664 "Can't open the log file: %s", strerror(errno));
1665 goto loaderr;
1666 }
1667 fclose(logfp);
1668 }
1669 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1670 server.dbnum = atoi(argv[1]);
1671 if (server.dbnum < 1) {
1672 err = "Invalid number of databases"; goto loaderr;
1673 }
1674 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1675 server.maxclients = atoi(argv[1]);
1676 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1677 server.maxmemory = strtoll(argv[1], NULL, 10);
1678 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1679 server.masterhost = sdsnew(argv[1]);
1680 server.masterport = atoi(argv[2]);
1681 server.replstate = REDIS_REPL_CONNECT;
1682 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1683 server.masterauth = zstrdup(argv[1]);
1684 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1685 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1686 err = "argument must be 'yes' or 'no'"; goto loaderr;
1687 }
1688 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1689 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1690 err = "argument must be 'yes' or 'no'"; goto loaderr;
1691 }
1692 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1693 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1697 server.sharingpoolsize = atoi(argv[1]);
1698 if (server.sharingpoolsize < 1) {
1699 err = "invalid object sharing pool size"; goto loaderr;
1700 }
1701 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1702 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1703 err = "argument must be 'yes' or 'no'"; goto loaderr;
1704 }
1705 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1706 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1707 err = "argument must be 'yes' or 'no'"; goto loaderr;
1708 }
1709 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1710 if (!strcasecmp(argv[1],"no")) {
1711 server.appendfsync = APPENDFSYNC_NO;
1712 } else if (!strcasecmp(argv[1],"always")) {
1713 server.appendfsync = APPENDFSYNC_ALWAYS;
1714 } else if (!strcasecmp(argv[1],"everysec")) {
1715 server.appendfsync = APPENDFSYNC_EVERYSEC;
1716 } else {
1717 err = "argument must be 'no', 'always' or 'everysec'";
1718 goto loaderr;
1719 }
1720 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1721 server.requirepass = zstrdup(argv[1]);
1722 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1723 server.pidfile = zstrdup(argv[1]);
1724 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1725 server.dbfilename = zstrdup(argv[1]);
1726 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1727 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1728 err = "argument must be 'yes' or 'no'"; goto loaderr;
1729 }
1730 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1731 zfree(server.vm_swap_file);
1732 server.vm_swap_file = zstrdup(argv[1]);
1733 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1734 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1735 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1736 server.vm_page_size = strtoll(argv[1], NULL, 10);
1737 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1738 server.vm_pages = strtoll(argv[1], NULL, 10);
1739 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1740 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1741 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1742 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1743 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1744 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1745 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1746 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1747 } else {
1748 err = "Bad directive or wrong number of arguments"; goto loaderr;
1749 }
1750 for (j = 0; j < argc; j++)
1751 sdsfree(argv[j]);
1752 zfree(argv);
1753 sdsfree(line);
1754 }
1755 if (fp != stdin) fclose(fp);
1756 return;
1757
1758 loaderr:
1759 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1760 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1761 fprintf(stderr, ">>> '%s'\n", line);
1762 fprintf(stderr, "%s\n", err);
1763 exit(1);
1764 }
1765
1766 static void freeClientArgv(redisClient *c) {
1767 int j;
1768
1769 for (j = 0; j < c->argc; j++)
1770 decrRefCount(c->argv[j]);
1771 for (j = 0; j < c->mbargc; j++)
1772 decrRefCount(c->mbargv[j]);
1773 c->argc = 0;
1774 c->mbargc = 0;
1775 }
1776
1777 static void freeClient(redisClient *c) {
1778 listNode *ln;
1779
1780 /* Note that if the client we are freeing is blocked into a blocking
1781 * call, we have to set querybuf to NULL *before* to call
1782 * unblockClientWaitingData() to avoid processInputBuffer() will get
1783 * called. Also it is important to remove the file events after
1784 * this, because this call adds the READABLE event. */
1785 sdsfree(c->querybuf);
1786 c->querybuf = NULL;
1787 if (c->flags & REDIS_BLOCKED)
1788 unblockClientWaitingData(c);
1789
1790 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1791 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1792 listRelease(c->reply);
1793 freeClientArgv(c);
1794 close(c->fd);
1795 /* Remove from the list of clients */
1796 ln = listSearchKey(server.clients,c);
1797 redisAssert(ln != NULL);
1798 listDelNode(server.clients,ln);
1799 /* Remove from the list of clients waiting for swapped keys */
1800 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1801 ln = listSearchKey(server.io_ready_clients,c);
1802 if (ln) {
1803 listDelNode(server.io_ready_clients,ln);
1804 server.vm_blocked_clients--;
1805 }
1806 }
1807 while (server.vm_enabled && listLength(c->io_keys)) {
1808 ln = listFirst(c->io_keys);
1809 dontWaitForSwappedKey(c,ln->value);
1810 }
1811 listRelease(c->io_keys);
1812 /* Other cleanup */
1813 if (c->flags & REDIS_SLAVE) {
1814 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1815 close(c->repldbfd);
1816 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1817 ln = listSearchKey(l,c);
1818 redisAssert(ln != NULL);
1819 listDelNode(l,ln);
1820 }
1821 if (c->flags & REDIS_MASTER) {
1822 server.master = NULL;
1823 server.replstate = REDIS_REPL_CONNECT;
1824 }
1825 zfree(c->argv);
1826 zfree(c->mbargv);
1827 freeClientMultiState(c);
1828 zfree(c);
1829 }
1830
1831 #define GLUEREPLY_UP_TO (1024)
1832 static void glueReplyBuffersIfNeeded(redisClient *c) {
1833 int copylen = 0;
1834 char buf[GLUEREPLY_UP_TO];
1835 listNode *ln;
1836 listIter li;
1837 robj *o;
1838
1839 listRewind(c->reply,&li);
1840 while((ln = listNext(&li))) {
1841 int objlen;
1842
1843 o = ln->value;
1844 objlen = sdslen(o->ptr);
1845 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1846 memcpy(buf+copylen,o->ptr,objlen);
1847 copylen += objlen;
1848 listDelNode(c->reply,ln);
1849 } else {
1850 if (copylen == 0) return;
1851 break;
1852 }
1853 }
1854 /* Now the output buffer is empty, add the new single element */
1855 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1856 listAddNodeHead(c->reply,o);
1857 }
1858
1859 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1860 redisClient *c = privdata;
1861 int nwritten = 0, totwritten = 0, objlen;
1862 robj *o;
1863 REDIS_NOTUSED(el);
1864 REDIS_NOTUSED(mask);
1865
1866 /* Use writev() if we have enough buffers to send */
1867 if (!server.glueoutputbuf &&
1868 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1869 !(c->flags & REDIS_MASTER))
1870 {
1871 sendReplyToClientWritev(el, fd, privdata, mask);
1872 return;
1873 }
1874
1875 while(listLength(c->reply)) {
1876 if (server.glueoutputbuf && listLength(c->reply) > 1)
1877 glueReplyBuffersIfNeeded(c);
1878
1879 o = listNodeValue(listFirst(c->reply));
1880 objlen = sdslen(o->ptr);
1881
1882 if (objlen == 0) {
1883 listDelNode(c->reply,listFirst(c->reply));
1884 continue;
1885 }
1886
1887 if (c->flags & REDIS_MASTER) {
1888 /* Don't reply to a master */
1889 nwritten = objlen - c->sentlen;
1890 } else {
1891 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1892 if (nwritten <= 0) break;
1893 }
1894 c->sentlen += nwritten;
1895 totwritten += nwritten;
1896 /* If we fully sent the object on head go to the next one */
1897 if (c->sentlen == objlen) {
1898 listDelNode(c->reply,listFirst(c->reply));
1899 c->sentlen = 0;
1900 }
1901 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1902 * bytes, in a single threaded server it's a good idea to serve
1903 * other clients as well, even if a very large request comes from
1904 * super fast link that is always able to accept data (in real world
1905 * scenario think about 'KEYS *' against the loopback interfae) */
1906 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1907 }
1908 if (nwritten == -1) {
1909 if (errno == EAGAIN) {
1910 nwritten = 0;
1911 } else {
1912 redisLog(REDIS_VERBOSE,
1913 "Error writing to client: %s", strerror(errno));
1914 freeClient(c);
1915 return;
1916 }
1917 }
1918 if (totwritten > 0) c->lastinteraction = time(NULL);
1919 if (listLength(c->reply) == 0) {
1920 c->sentlen = 0;
1921 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1922 }
1923 }
1924
1925 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1926 {
1927 redisClient *c = privdata;
1928 int nwritten = 0, totwritten = 0, objlen, willwrite;
1929 robj *o;
1930 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1931 int offset, ion = 0;
1932 REDIS_NOTUSED(el);
1933 REDIS_NOTUSED(mask);
1934
1935 listNode *node;
1936 while (listLength(c->reply)) {
1937 offset = c->sentlen;
1938 ion = 0;
1939 willwrite = 0;
1940
1941 /* fill-in the iov[] array */
1942 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1943 o = listNodeValue(node);
1944 objlen = sdslen(o->ptr);
1945
1946 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1947 break;
1948
1949 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1950 break; /* no more iovecs */
1951
1952 iov[ion].iov_base = ((char*)o->ptr) + offset;
1953 iov[ion].iov_len = objlen - offset;
1954 willwrite += objlen - offset;
1955 offset = 0; /* just for the first item */
1956 ion++;
1957 }
1958
1959 if(willwrite == 0)
1960 break;
1961
1962 /* write all collected blocks at once */
1963 if((nwritten = writev(fd, iov, ion)) < 0) {
1964 if (errno != EAGAIN) {
1965 redisLog(REDIS_VERBOSE,
1966 "Error writing to client: %s", strerror(errno));
1967 freeClient(c);
1968 return;
1969 }
1970 break;
1971 }
1972
1973 totwritten += nwritten;
1974 offset = c->sentlen;
1975
1976 /* remove written robjs from c->reply */
1977 while (nwritten && listLength(c->reply)) {
1978 o = listNodeValue(listFirst(c->reply));
1979 objlen = sdslen(o->ptr);
1980
1981 if(nwritten >= objlen - offset) {
1982 listDelNode(c->reply, listFirst(c->reply));
1983 nwritten -= objlen - offset;
1984 c->sentlen = 0;
1985 } else {
1986 /* partial write */
1987 c->sentlen += nwritten;
1988 break;
1989 }
1990 offset = 0;
1991 }
1992 }
1993
1994 if (totwritten > 0)
1995 c->lastinteraction = time(NULL);
1996
1997 if (listLength(c->reply) == 0) {
1998 c->sentlen = 0;
1999 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2000 }
2001 }
2002
2003 static struct redisCommand *lookupCommand(char *name) {
2004 int j = 0;
2005 while(cmdTable[j].name != NULL) {
2006 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2007 j++;
2008 }
2009 return NULL;
2010 }
2011
2012 /* resetClient prepare the client to process the next command */
2013 static void resetClient(redisClient *c) {
2014 freeClientArgv(c);
2015 c->bulklen = -1;
2016 c->multibulk = 0;
2017 }
2018
2019 /* Call() is the core of Redis execution of a command */
2020 static void call(redisClient *c, struct redisCommand *cmd) {
2021 long long dirty;
2022
2023 dirty = server.dirty;
2024 cmd->proc(c);
2025 if (server.appendonly && server.dirty-dirty)
2026 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2027 if (server.dirty-dirty && listLength(server.slaves))
2028 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2029 if (listLength(server.monitors))
2030 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2031 server.stat_numcommands++;
2032 }
2033
2034 /* If this function gets called we already read a whole
2035 * command, argments are in the client argv/argc fields.
2036 * processCommand() execute the command or prepare the
2037 * server for a bulk read from the client.
2038 *
2039 * If 1 is returned the client is still alive and valid and
2040 * and other operations can be performed by the caller. Otherwise
2041 * if 0 is returned the client was destroied (i.e. after QUIT). */
2042 static int processCommand(redisClient *c) {
2043 struct redisCommand *cmd;
2044
2045 /* Free some memory if needed (maxmemory setting) */
2046 if (server.maxmemory) freeMemoryIfNeeded();
2047
2048 /* Handle the multi bulk command type. This is an alternative protocol
2049 * supported by Redis in order to receive commands that are composed of
2050 * multiple binary-safe "bulk" arguments. The latency of processing is
2051 * a bit higher but this allows things like multi-sets, so if this
2052 * protocol is used only for MSET and similar commands this is a big win. */
2053 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2054 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2055 if (c->multibulk <= 0) {
2056 resetClient(c);
2057 return 1;
2058 } else {
2059 decrRefCount(c->argv[c->argc-1]);
2060 c->argc--;
2061 return 1;
2062 }
2063 } else if (c->multibulk) {
2064 if (c->bulklen == -1) {
2065 if (((char*)c->argv[0]->ptr)[0] != '$') {
2066 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2067 resetClient(c);
2068 return 1;
2069 } else {
2070 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2071 decrRefCount(c->argv[0]);
2072 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2073 c->argc--;
2074 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2075 resetClient(c);
2076 return 1;
2077 }
2078 c->argc--;
2079 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2080 return 1;
2081 }
2082 } else {
2083 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2084 c->mbargv[c->mbargc] = c->argv[0];
2085 c->mbargc++;
2086 c->argc--;
2087 c->multibulk--;
2088 if (c->multibulk == 0) {
2089 robj **auxargv;
2090 int auxargc;
2091
2092 /* Here we need to swap the multi-bulk argc/argv with the
2093 * normal argc/argv of the client structure. */
2094 auxargv = c->argv;
2095 c->argv = c->mbargv;
2096 c->mbargv = auxargv;
2097
2098 auxargc = c->argc;
2099 c->argc = c->mbargc;
2100 c->mbargc = auxargc;
2101
2102 /* We need to set bulklen to something different than -1
2103 * in order for the code below to process the command without
2104 * to try to read the last argument of a bulk command as
2105 * a special argument. */
2106 c->bulklen = 0;
2107 /* continue below and process the command */
2108 } else {
2109 c->bulklen = -1;
2110 return 1;
2111 }
2112 }
2113 }
2114 /* -- end of multi bulk commands processing -- */
2115
2116 /* The QUIT command is handled as a special case. Normal command
2117 * procs are unable to close the client connection safely */
2118 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2119 freeClient(c);
2120 return 0;
2121 }
2122
2123 /* Now lookup the command and check ASAP about trivial error conditions
2124 * such wrong arity, bad command name and so forth. */
2125 cmd = lookupCommand(c->argv[0]->ptr);
2126 if (!cmd) {
2127 addReplySds(c,
2128 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2129 (char*)c->argv[0]->ptr));
2130 resetClient(c);
2131 return 1;
2132 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2133 (c->argc < -cmd->arity)) {
2134 addReplySds(c,
2135 sdscatprintf(sdsempty(),
2136 "-ERR wrong number of arguments for '%s' command\r\n",
2137 cmd->name));
2138 resetClient(c);
2139 return 1;
2140 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2141 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2142 resetClient(c);
2143 return 1;
2144 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2145 /* This is a bulk command, we have to read the last argument yet. */
2146 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2147
2148 decrRefCount(c->argv[c->argc-1]);
2149 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2150 c->argc--;
2151 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2152 resetClient(c);
2153 return 1;
2154 }
2155 c->argc--;
2156 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2157 /* It is possible that the bulk read is already in the
2158 * buffer. Check this condition and handle it accordingly.
2159 * This is just a fast path, alternative to call processInputBuffer().
2160 * It's a good idea since the code is small and this condition
2161 * happens most of the times. */
2162 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2163 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2164 c->argc++;
2165 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2166 } else {
2167 /* Otherwise return... there is to read the last argument
2168 * from the socket. */
2169 return 1;
2170 }
2171 }
2172 /* Let's try to share objects on the command arguments vector */
2173 if (server.shareobjects) {
2174 int j;
2175 for(j = 1; j < c->argc; j++)
2176 c->argv[j] = tryObjectSharing(c->argv[j]);
2177 }
2178 /* Let's try to encode the bulk object to save space. */
2179 if (cmd->flags & REDIS_CMD_BULK)
2180 tryObjectEncoding(c->argv[c->argc-1]);
2181
2182 /* Check if the user is authenticated */
2183 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2184 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2185 resetClient(c);
2186 return 1;
2187 }
2188
2189 /* Exec the command */
2190 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2191 queueMultiCommand(c,cmd);
2192 addReply(c,shared.queued);
2193 } else {
2194 if (server.vm_enabled && server.vm_max_threads > 0 &&
2195 blockClientOnSwappedKeys(cmd,c)) return 1;
2196 call(c,cmd);
2197 }
2198
2199 /* Prepare the client for the next command */
2200 resetClient(c);
2201 return 1;
2202 }
2203
2204 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2205 listNode *ln;
2206 listIter li;
2207 int outc = 0, j;
2208 robj **outv;
2209 /* (args*2)+1 is enough room for args, spaces, newlines */
2210 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2211
2212 if (argc <= REDIS_STATIC_ARGS) {
2213 outv = static_outv;
2214 } else {
2215 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2216 }
2217
2218 for (j = 0; j < argc; j++) {
2219 if (j != 0) outv[outc++] = shared.space;
2220 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2221 robj *lenobj;
2222
2223 lenobj = createObject(REDIS_STRING,
2224 sdscatprintf(sdsempty(),"%lu\r\n",
2225 (unsigned long) stringObjectLen(argv[j])));
2226 lenobj->refcount = 0;
2227 outv[outc++] = lenobj;
2228 }
2229 outv[outc++] = argv[j];
2230 }
2231 outv[outc++] = shared.crlf;
2232
2233 /* Increment all the refcounts at start and decrement at end in order to
2234 * be sure to free objects if there is no slave in a replication state
2235 * able to be feed with commands */
2236 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2237 listRewind(slaves,&li);
2238 while((ln = listNext(&li))) {
2239 redisClient *slave = ln->value;
2240
2241 /* Don't feed slaves that are still waiting for BGSAVE to start */
2242 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2243
2244 /* Feed all the other slaves, MONITORs and so on */
2245 if (slave->slaveseldb != dictid) {
2246 robj *selectcmd;
2247
2248 switch(dictid) {
2249 case 0: selectcmd = shared.select0; break;
2250 case 1: selectcmd = shared.select1; break;
2251 case 2: selectcmd = shared.select2; break;
2252 case 3: selectcmd = shared.select3; break;
2253 case 4: selectcmd = shared.select4; break;
2254 case 5: selectcmd = shared.select5; break;
2255 case 6: selectcmd = shared.select6; break;
2256 case 7: selectcmd = shared.select7; break;
2257 case 8: selectcmd = shared.select8; break;
2258 case 9: selectcmd = shared.select9; break;
2259 default:
2260 selectcmd = createObject(REDIS_STRING,
2261 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2262 selectcmd->refcount = 0;
2263 break;
2264 }
2265 addReply(slave,selectcmd);
2266 slave->slaveseldb = dictid;
2267 }
2268 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2269 }
2270 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2271 if (outv != static_outv) zfree(outv);
2272 }
2273
2274 static void processInputBuffer(redisClient *c) {
2275 again:
2276 /* Before to process the input buffer, make sure the client is not
2277 * waitig for a blocking operation such as BLPOP. Note that the first
2278 * iteration the client is never blocked, otherwise the processInputBuffer
2279 * would not be called at all, but after the execution of the first commands
2280 * in the input buffer the client may be blocked, and the "goto again"
2281 * will try to reiterate. The following line will make it return asap. */
2282 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2283 if (c->bulklen == -1) {
2284 /* Read the first line of the query */
2285 char *p = strchr(c->querybuf,'\n');
2286 size_t querylen;
2287
2288 if (p) {
2289 sds query, *argv;
2290 int argc, j;
2291
2292 query = c->querybuf;
2293 c->querybuf = sdsempty();
2294 querylen = 1+(p-(query));
2295 if (sdslen(query) > querylen) {
2296 /* leave data after the first line of the query in the buffer */
2297 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2298 }
2299 *p = '\0'; /* remove "\n" */
2300 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2301 sdsupdatelen(query);
2302
2303 /* Now we can split the query in arguments */
2304 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2305 sdsfree(query);
2306
2307 if (c->argv) zfree(c->argv);
2308 c->argv = zmalloc(sizeof(robj*)*argc);
2309
2310 for (j = 0; j < argc; j++) {
2311 if (sdslen(argv[j])) {
2312 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2313 c->argc++;
2314 } else {
2315 sdsfree(argv[j]);
2316 }
2317 }
2318 zfree(argv);
2319 if (c->argc) {
2320 /* Execute the command. If the client is still valid
2321 * after processCommand() return and there is something
2322 * on the query buffer try to process the next command. */
2323 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2324 } else {
2325 /* Nothing to process, argc == 0. Just process the query
2326 * buffer if it's not empty or return to the caller */
2327 if (sdslen(c->querybuf)) goto again;
2328 }
2329 return;
2330 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2331 redisLog(REDIS_VERBOSE, "Client protocol error");
2332 freeClient(c);
2333 return;
2334 }
2335 } else {
2336 /* Bulk read handling. Note that if we are at this point
2337 the client already sent a command terminated with a newline,
2338 we are reading the bulk data that is actually the last
2339 argument of the command. */
2340 int qbl = sdslen(c->querybuf);
2341
2342 if (c->bulklen <= qbl) {
2343 /* Copy everything but the final CRLF as final argument */
2344 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2345 c->argc++;
2346 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2347 /* Process the command. If the client is still valid after
2348 * the processing and there is more data in the buffer
2349 * try to parse it. */
2350 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2351 return;
2352 }
2353 }
2354 }
2355
2356 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2357 redisClient *c = (redisClient*) privdata;
2358 char buf[REDIS_IOBUF_LEN];
2359 int nread;
2360 REDIS_NOTUSED(el);
2361 REDIS_NOTUSED(mask);
2362
2363 nread = read(fd, buf, REDIS_IOBUF_LEN);
2364 if (nread == -1) {
2365 if (errno == EAGAIN) {
2366 nread = 0;
2367 } else {
2368 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2369 freeClient(c);
2370 return;
2371 }
2372 } else if (nread == 0) {
2373 redisLog(REDIS_VERBOSE, "Client closed connection");
2374 freeClient(c);
2375 return;
2376 }
2377 if (nread) {
2378 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2379 c->lastinteraction = time(NULL);
2380 } else {
2381 return;
2382 }
2383 if (!(c->flags & REDIS_BLOCKED))
2384 processInputBuffer(c);
2385 }
2386
2387 static int selectDb(redisClient *c, int id) {
2388 if (id < 0 || id >= server.dbnum)
2389 return REDIS_ERR;
2390 c->db = &server.db[id];
2391 return REDIS_OK;
2392 }
2393
2394 static void *dupClientReplyValue(void *o) {
2395 incrRefCount((robj*)o);
2396 return o;
2397 }
2398
2399 static redisClient *createClient(int fd) {
2400 redisClient *c = zmalloc(sizeof(*c));
2401
2402 anetNonBlock(NULL,fd);
2403 anetTcpNoDelay(NULL,fd);
2404 if (!c) return NULL;
2405 selectDb(c,0);
2406 c->fd = fd;
2407 c->querybuf = sdsempty();
2408 c->argc = 0;
2409 c->argv = NULL;
2410 c->bulklen = -1;
2411 c->multibulk = 0;
2412 c->mbargc = 0;
2413 c->mbargv = NULL;
2414 c->sentlen = 0;
2415 c->flags = 0;
2416 c->lastinteraction = time(NULL);
2417 c->authenticated = 0;
2418 c->replstate = REDIS_REPL_NONE;
2419 c->reply = listCreate();
2420 listSetFreeMethod(c->reply,decrRefCount);
2421 listSetDupMethod(c->reply,dupClientReplyValue);
2422 c->blockingkeys = NULL;
2423 c->blockingkeysnum = 0;
2424 c->io_keys = listCreate();
2425 listSetFreeMethod(c->io_keys,decrRefCount);
2426 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2427 readQueryFromClient, c) == AE_ERR) {
2428 freeClient(c);
2429 return NULL;
2430 }
2431 listAddNodeTail(server.clients,c);
2432 initClientMultiState(c);
2433 return c;
2434 }
2435
2436 static void addReply(redisClient *c, robj *obj) {
2437 if (listLength(c->reply) == 0 &&
2438 (c->replstate == REDIS_REPL_NONE ||
2439 c->replstate == REDIS_REPL_ONLINE) &&
2440 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2441 sendReplyToClient, c) == AE_ERR) return;
2442
2443 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2444 obj = dupStringObject(obj);
2445 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2446 }
2447 listAddNodeTail(c->reply,getDecodedObject(obj));
2448 }
2449
2450 static void addReplySds(redisClient *c, sds s) {
2451 robj *o = createObject(REDIS_STRING,s);
2452 addReply(c,o);
2453 decrRefCount(o);
2454 }
2455
2456 static void addReplyDouble(redisClient *c, double d) {
2457 char buf[128];
2458
2459 snprintf(buf,sizeof(buf),"%.17g",d);
2460 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2461 (unsigned long) strlen(buf),buf));
2462 }
2463
2464 static void addReplyLong(redisClient *c, long l) {
2465 char buf[128];
2466 size_t len;
2467
2468 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2469 addReplySds(c,sdsnewlen(buf,len));
2470 }
2471
2472 static void addReplyBulkLen(redisClient *c, robj *obj) {
2473 size_t len;
2474
2475 if (obj->encoding == REDIS_ENCODING_RAW) {
2476 len = sdslen(obj->ptr);
2477 } else {
2478 long n = (long)obj->ptr;
2479
2480 /* Compute how many bytes will take this integer as a radix 10 string */
2481 len = 1;
2482 if (n < 0) {
2483 len++;
2484 n = -n;
2485 }
2486 while((n = n/10) != 0) {
2487 len++;
2488 }
2489 }
2490 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2491 }
2492
2493 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2494 int cport, cfd;
2495 char cip[128];
2496 redisClient *c;
2497 REDIS_NOTUSED(el);
2498 REDIS_NOTUSED(mask);
2499 REDIS_NOTUSED(privdata);
2500
2501 cfd = anetAccept(server.neterr, fd, cip, &cport);
2502 if (cfd == AE_ERR) {
2503 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2504 return;
2505 }
2506 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2507 if ((c = createClient(cfd)) == NULL) {
2508 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2509 close(cfd); /* May be already closed, just ingore errors */
2510 return;
2511 }
2512 /* If maxclient directive is set and this is one client more... close the
2513 * connection. Note that we create the client instead to check before
2514 * for this condition, since now the socket is already set in nonblocking
2515 * mode and we can send an error for free using the Kernel I/O */
2516 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2517 char *err = "-ERR max number of clients reached\r\n";
2518
2519 /* That's a best effort error message, don't check write errors */
2520 if (write(c->fd,err,strlen(err)) == -1) {
2521 /* Nothing to do, Just to avoid the warning... */
2522 }
2523 freeClient(c);
2524 return;
2525 }
2526 server.stat_numconnections++;
2527 }
2528
2529 /* ======================= Redis objects implementation ===================== */
2530
2531 static robj *createObject(int type, void *ptr) {
2532 robj *o;
2533
2534 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2535 if (listLength(server.objfreelist)) {
2536 listNode *head = listFirst(server.objfreelist);
2537 o = listNodeValue(head);
2538 listDelNode(server.objfreelist,head);
2539 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2540 } else {
2541 if (server.vm_enabled) {
2542 pthread_mutex_unlock(&server.obj_freelist_mutex);
2543 o = zmalloc(sizeof(*o));
2544 } else {
2545 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2546 }
2547 }
2548 o->type = type;
2549 o->encoding = REDIS_ENCODING_RAW;
2550 o->ptr = ptr;
2551 o->refcount = 1;
2552 if (server.vm_enabled) {
2553 /* Note that this code may run in the context of an I/O thread
2554 * and accessing to server.unixtime in theory is an error
2555 * (no locks). But in practice this is safe, and even if we read
2556 * garbage Redis will not fail, as it's just a statistical info */
2557 o->vm.atime = server.unixtime;
2558 o->storage = REDIS_VM_MEMORY;
2559 }
2560 return o;
2561 }
2562
2563 static robj *createStringObject(char *ptr, size_t len) {
2564 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2565 }
2566
2567 static robj *dupStringObject(robj *o) {
2568 assert(o->encoding == REDIS_ENCODING_RAW);
2569 return createStringObject(o->ptr,sdslen(o->ptr));
2570 }
2571
2572 static robj *createListObject(void) {
2573 list *l = listCreate();
2574
2575 listSetFreeMethod(l,decrRefCount);
2576 return createObject(REDIS_LIST,l);
2577 }
2578
2579 static robj *createSetObject(void) {
2580 dict *d = dictCreate(&setDictType,NULL);
2581 return createObject(REDIS_SET,d);
2582 }
2583
2584 static robj *createHashObject(void) {
2585 /* All the Hashes start as zipmaps. Will be automatically converted
2586 * into hash tables if there are enough elements or big elements
2587 * inside. */
2588 unsigned char *zm = zipmapNew();
2589 robj *o = createObject(REDIS_HASH,zm);
2590 o->encoding = REDIS_ENCODING_ZIPMAP;
2591 return o;
2592 }
2593
2594 static robj *createZsetObject(void) {
2595 zset *zs = zmalloc(sizeof(*zs));
2596
2597 zs->dict = dictCreate(&zsetDictType,NULL);
2598 zs->zsl = zslCreate();
2599 return createObject(REDIS_ZSET,zs);
2600 }
2601
2602 static void freeStringObject(robj *o) {
2603 if (o->encoding == REDIS_ENCODING_RAW) {
2604 sdsfree(o->ptr);
2605 }
2606 }
2607
2608 static void freeListObject(robj *o) {
2609 listRelease((list*) o->ptr);
2610 }
2611
2612 static void freeSetObject(robj *o) {
2613 dictRelease((dict*) o->ptr);
2614 }
2615
2616 static void freeZsetObject(robj *o) {
2617 zset *zs = o->ptr;
2618
2619 dictRelease(zs->dict);
2620 zslFree(zs->zsl);
2621 zfree(zs);
2622 }
2623
2624 static void freeHashObject(robj *o) {
2625 switch (o->encoding) {
2626 case REDIS_ENCODING_HT:
2627 dictRelease((dict*) o->ptr);
2628 break;
2629 case REDIS_ENCODING_ZIPMAP:
2630 zfree(o->ptr);
2631 break;
2632 default:
2633 redisAssert(0);
2634 break;
2635 }
2636 }
2637
2638 static void incrRefCount(robj *o) {
2639 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2640 o->refcount++;
2641 }
2642
2643 static void decrRefCount(void *obj) {
2644 robj *o = obj;
2645
2646 /* Object is a key of a swapped out value, or in the process of being
2647 * loaded. */
2648 if (server.vm_enabled &&
2649 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2650 {
2651 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2652 redisAssert(o->refcount == 1);
2653 }
2654 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2655 redisAssert(o->type == REDIS_STRING);
2656 freeStringObject(o);
2657 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2658 pthread_mutex_lock(&server.obj_freelist_mutex);
2659 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2660 !listAddNodeHead(server.objfreelist,o))
2661 zfree(o);
2662 pthread_mutex_unlock(&server.obj_freelist_mutex);
2663 server.vm_stats_swapped_objects--;
2664 return;
2665 }
2666 /* Object is in memory, or in the process of being swapped out. */
2667 if (--(o->refcount) == 0) {
2668 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2669 vmCancelThreadedIOJob(obj);
2670 switch(o->type) {
2671 case REDIS_STRING: freeStringObject(o); break;
2672 case REDIS_LIST: freeListObject(o); break;
2673 case REDIS_SET: freeSetObject(o); break;
2674 case REDIS_ZSET: freeZsetObject(o); break;
2675 case REDIS_HASH: freeHashObject(o); break;
2676 default: redisAssert(0 != 0); break;
2677 }
2678 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2679 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2680 !listAddNodeHead(server.objfreelist,o))
2681 zfree(o);
2682 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2683 }
2684 }
2685
2686 static robj *lookupKey(redisDb *db, robj *key) {
2687 dictEntry *de = dictFind(db->dict,key);
2688 if (de) {
2689 robj *key = dictGetEntryKey(de);
2690 robj *val = dictGetEntryVal(de);
2691
2692 if (server.vm_enabled) {
2693 if (key->storage == REDIS_VM_MEMORY ||
2694 key->storage == REDIS_VM_SWAPPING)
2695 {
2696 /* If we were swapping the object out, stop it, this key
2697 * was requested. */
2698 if (key->storage == REDIS_VM_SWAPPING)
2699 vmCancelThreadedIOJob(key);
2700 /* Update the access time of the key for the aging algorithm. */
2701 key->vm.atime = server.unixtime;
2702 } else {
2703 int notify = (key->storage == REDIS_VM_LOADING);
2704
2705 /* Our value was swapped on disk. Bring it at home. */
2706 redisAssert(val == NULL);
2707 val = vmLoadObject(key);
2708 dictGetEntryVal(de) = val;
2709
2710 /* Clients blocked by the VM subsystem may be waiting for
2711 * this key... */
2712 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2713 }
2714 }
2715 return val;
2716 } else {
2717 return NULL;
2718 }
2719 }
2720
2721 static robj *lookupKeyRead(redisDb *db, robj *key) {
2722 expireIfNeeded(db,key);
2723 return lookupKey(db,key);
2724 }
2725
2726 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2727 deleteIfVolatile(db,key);
2728 return lookupKey(db,key);
2729 }
2730
2731 static int deleteKey(redisDb *db, robj *key) {
2732 int retval;
2733
2734 /* We need to protect key from destruction: after the first dictDelete()
2735 * it may happen that 'key' is no longer valid if we don't increment
2736 * it's count. This may happen when we get the object reference directly
2737 * from the hash table with dictRandomKey() or dict iterators */
2738 incrRefCount(key);
2739 if (dictSize(db->expires)) dictDelete(db->expires,key);
2740 retval = dictDelete(db->dict,key);
2741 decrRefCount(key);
2742
2743 return retval == DICT_OK;
2744 }
2745
2746 /* Try to share an object against the shared objects pool */
2747 static robj *tryObjectSharing(robj *o) {
2748 struct dictEntry *de;
2749 unsigned long c;
2750
2751 if (o == NULL || server.shareobjects == 0) return o;
2752
2753 redisAssert(o->type == REDIS_STRING);
2754 de = dictFind(server.sharingpool,o);
2755 if (de) {
2756 robj *shared = dictGetEntryKey(de);
2757
2758 c = ((unsigned long) dictGetEntryVal(de))+1;
2759 dictGetEntryVal(de) = (void*) c;
2760 incrRefCount(shared);
2761 decrRefCount(o);
2762 return shared;
2763 } else {
2764 /* Here we are using a stream algorihtm: Every time an object is
2765 * shared we increment its count, everytime there is a miss we
2766 * recrement the counter of a random object. If this object reaches
2767 * zero we remove the object and put the current object instead. */
2768 if (dictSize(server.sharingpool) >=
2769 server.sharingpoolsize) {
2770 de = dictGetRandomKey(server.sharingpool);
2771 redisAssert(de != NULL);
2772 c = ((unsigned long) dictGetEntryVal(de))-1;
2773 dictGetEntryVal(de) = (void*) c;
2774 if (c == 0) {
2775 dictDelete(server.sharingpool,de->key);
2776 }
2777 } else {
2778 c = 0; /* If the pool is empty we want to add this object */
2779 }
2780 if (c == 0) {
2781 int retval;
2782
2783 retval = dictAdd(server.sharingpool,o,(void*)1);
2784 redisAssert(retval == DICT_OK);
2785 incrRefCount(o);
2786 }
2787 return o;
2788 }
2789 }
2790
2791 /* Check if the nul-terminated string 's' can be represented by a long
2792 * (that is, is a number that fits into long without any other space or
2793 * character before or after the digits).
2794 *
2795 * If so, the function returns REDIS_OK and *longval is set to the value
2796 * of the number. Otherwise REDIS_ERR is returned */
2797 static int isStringRepresentableAsLong(sds s, long *longval) {
2798 char buf[32], *endptr;
2799 long value;
2800 int slen;
2801
2802 value = strtol(s, &endptr, 10);
2803 if (endptr[0] != '\0') return REDIS_ERR;
2804 slen = snprintf(buf,32,"%ld",value);
2805
2806 /* If the number converted back into a string is not identical
2807 * then it's not possible to encode the string as integer */
2808 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2809 if (longval) *longval = value;
2810 return REDIS_OK;
2811 }
2812
2813 /* Try to encode a string object in order to save space */
2814 static int tryObjectEncoding(robj *o) {
2815 long value;
2816 sds s = o->ptr;
2817
2818 if (o->encoding != REDIS_ENCODING_RAW)
2819 return REDIS_ERR; /* Already encoded */
2820
2821 /* It's not save to encode shared objects: shared objects can be shared
2822 * everywhere in the "object space" of Redis. Encoded objects can only
2823 * appear as "values" (and not, for instance, as keys) */
2824 if (o->refcount > 1) return REDIS_ERR;
2825
2826 /* Currently we try to encode only strings */
2827 redisAssert(o->type == REDIS_STRING);
2828
2829 /* Check if we can represent this string as a long integer */
2830 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2831
2832 /* Ok, this object can be encoded */
2833 o->encoding = REDIS_ENCODING_INT;
2834 sdsfree(o->ptr);
2835 o->ptr = (void*) value;
2836 return REDIS_OK;
2837 }
2838
2839 /* Get a decoded version of an encoded object (returned as a new object).
2840 * If the object is already raw-encoded just increment the ref count. */
2841 static robj *getDecodedObject(robj *o) {
2842 robj *dec;
2843
2844 if (o->encoding == REDIS_ENCODING_RAW) {
2845 incrRefCount(o);
2846 return o;
2847 }
2848 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2849 char buf[32];
2850
2851 snprintf(buf,32,"%ld",(long)o->ptr);
2852 dec = createStringObject(buf,strlen(buf));
2853 return dec;
2854 } else {
2855 redisAssert(1 != 1);
2856 }
2857 }
2858
2859 /* Compare two string objects via strcmp() or alike.
2860 * Note that the objects may be integer-encoded. In such a case we
2861 * use snprintf() to get a string representation of the numbers on the stack
2862 * and compare the strings, it's much faster than calling getDecodedObject().
2863 *
2864 * Important note: if objects are not integer encoded, but binary-safe strings,
2865 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2866 * binary safe. */
2867 static int compareStringObjects(robj *a, robj *b) {
2868 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2869 char bufa[128], bufb[128], *astr, *bstr;
2870 int bothsds = 1;
2871
2872 if (a == b) return 0;
2873 if (a->encoding != REDIS_ENCODING_RAW) {
2874 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2875 astr = bufa;
2876 bothsds = 0;
2877 } else {
2878 astr = a->ptr;
2879 }
2880 if (b->encoding != REDIS_ENCODING_RAW) {
2881 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2882 bstr = bufb;
2883 bothsds = 0;
2884 } else {
2885 bstr = b->ptr;
2886 }
2887 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2888 }
2889
2890 static size_t stringObjectLen(robj *o) {
2891 redisAssert(o->type == REDIS_STRING);
2892 if (o->encoding == REDIS_ENCODING_RAW) {
2893 return sdslen(o->ptr);
2894 } else {
2895 char buf[32];
2896
2897 return snprintf(buf,32,"%ld",(long)o->ptr);
2898 }
2899 }
2900
2901 /*============================ RDB saving/loading =========================== */
2902
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
2907
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
}
2913
2914 /* check rdbLoadLen() comments for more info */
2915 static int rdbSaveLen(FILE *fp, uint32_t len) {
2916 unsigned char buf[2];
2917
2918 if (len < (1<<6)) {
2919 /* Save a 6 bit len */
2920 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2921 if (fwrite(buf,1,1,fp) == 0) return -1;
2922 } else if (len < (1<<14)) {
2923 /* Save a 14 bit len */
2924 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2925 buf[1] = len&0xFF;
2926 if (fwrite(buf,2,1,fp) == 0) return -1;
2927 } else {
2928 /* Save a 32 bit len */
2929 buf[0] = (REDIS_RDB_32BITLEN<<6);
2930 if (fwrite(buf,1,1,fp) == 0) return -1;
2931 len = htonl(len);
2932 if (fwrite(&len,4,1,fp) == 0) return -1;
2933 }
2934 return 0;
2935 }
2936
2937 /* String objects in the form "2391" "-100" without any space and with a
2938 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2939 * encoded as integers to save space */
2940 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2941 long long value;
2942 char *endptr, buf[32];
2943
2944 /* Check if it's possible to encode this value as a number */
2945 value = strtoll(s, &endptr, 10);
2946 if (endptr[0] != '\0') return 0;
2947 snprintf(buf,32,"%lld",value);
2948
2949 /* If the number converted back into a string is not identical
2950 * then it's not possible to encode the string as integer */
2951 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2952
2953 /* Finally check if it fits in our ranges */
2954 if (value >= -(1<<7) && value <= (1<<7)-1) {
2955 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2956 enc[1] = value&0xFF;
2957 return 2;
2958 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2959 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2960 enc[1] = value&0xFF;
2961 enc[2] = (value>>8)&0xFF;
2962 return 3;
2963 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2964 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2965 enc[1] = value&0xFF;
2966 enc[2] = (value>>8)&0xFF;
2967 enc[3] = (value>>16)&0xFF;
2968 enc[4] = (value>>24)&0xFF;
2969 return 5;
2970 } else {
2971 return 0;
2972 }
2973 }
2974
2975 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2976 unsigned int comprlen, outlen;
2977 unsigned char byte;
2978 void *out;
2979
2980 /* We require at least four bytes compression for this to be worth it */
2981 outlen = sdslen(obj->ptr)-4;
2982 if (outlen <= 0) return 0;
2983 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2984 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2985 if (comprlen == 0) {
2986 zfree(out);
2987 return 0;
2988 }
2989 /* Data compressed! Let's save it on disk */
2990 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2991 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2992 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2993 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2994 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2995 zfree(out);
2996 return comprlen;
2997
2998 writeerr:
2999 zfree(out);
3000 return -1;
3001 }
3002
3003 /* Save a string objet as [len][data] on disk. If the object is a string
3004 * representation of an integer value we try to safe it in a special form */
3005 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
3006 size_t len;
3007 int enclen;
3008
3009 len = sdslen(obj->ptr);
3010
3011 /* Try integer encoding */
3012 if (len <= 11) {
3013 unsigned char buf[5];
3014 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
3015 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3016 return 0;
3017 }
3018 }
3019
3020 /* Try LZF compression - under 20 bytes it's unable to compress even
3021 * aaaaaaaaaaaaaaaaaa so skip it */
3022 if (server.rdbcompression && len > 20) {
3023 int retval;
3024
3025 retval = rdbSaveLzfStringObject(fp,obj);
3026 if (retval == -1) return -1;
3027 if (retval > 0) return 0;
3028 /* retval == 0 means data can't be compressed, save the old way */
3029 }
3030
3031 /* Store verbatim */
3032 if (rdbSaveLen(fp,len) == -1) return -1;
3033 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
3034 return 0;
3035 }
3036
3037 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3038 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3039 int retval;
3040
3041 /* Avoid incr/decr ref count business when possible.
3042 * This plays well with copy-on-write given that we are probably
3043 * in a child process (BGSAVE). Also this makes sure key objects
3044 * of swapped objects are not incRefCount-ed (an assert does not allow
3045 * this in order to avoid bugs) */
3046 if (obj->encoding != REDIS_ENCODING_RAW) {
3047 obj = getDecodedObject(obj);
3048 retval = rdbSaveStringObjectRaw(fp,obj);
3049 decrRefCount(obj);
3050 } else {
3051 retval = rdbSaveStringObjectRaw(fp,obj);
3052 }
3053 return retval;
3054 }
3055
/* Save a double value. Doubles are saved as strings prefixed by an
 * unsigned 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3082
3083 /* Save a Redis object. */
3084 static int rdbSaveObject(FILE *fp, robj *o) {
3085 if (o->type == REDIS_STRING) {
3086 /* Save a string value */
3087 if (rdbSaveStringObject(fp,o) == -1) return -1;
3088 } else if (o->type == REDIS_LIST) {
3089 /* Save a list value */
3090 list *list = o->ptr;
3091 listIter li;
3092 listNode *ln;
3093
3094 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3095 listRewind(list,&li);
3096 while((ln = listNext(&li))) {
3097 robj *eleobj = listNodeValue(ln);
3098
3099 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3100 }
3101 } else if (o->type == REDIS_SET) {
3102 /* Save a set value */
3103 dict *set = o->ptr;
3104 dictIterator *di = dictGetIterator(set);
3105 dictEntry *de;
3106
3107 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3108 while((de = dictNext(di)) != NULL) {
3109 robj *eleobj = dictGetEntryKey(de);
3110
3111 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3112 }
3113 dictReleaseIterator(di);
3114 } else if (o->type == REDIS_ZSET) {
3115 /* Save a set value */
3116 zset *zs = o->ptr;
3117 dictIterator *di = dictGetIterator(zs->dict);
3118 dictEntry *de;
3119
3120 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3121 while((de = dictNext(di)) != NULL) {
3122 robj *eleobj = dictGetEntryKey(de);
3123 double *score = dictGetEntryVal(de);
3124
3125 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3126 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3127 }
3128 dictReleaseIterator(di);
3129 } else {
3130 redisAssert(0 != 0);
3131 }
3132 return 0;
3133 }
3134
3135 /* Return the length the object will have on disk if saved with
3136 * the rdbSaveObject() function. Currently we use a trick to get
3137 * this length with very little changes to the code. In the future
3138 * we could switch to a faster solution. */
3139 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3140 if (fp == NULL) fp = server.devnull;
3141 rewind(fp);
3142 assert(rdbSaveObject(fp,o) != 1);
3143 return ftello(fp);
3144 }
3145
3146 /* Return the number of pages required to save this object in the swap file */
3147 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3148 off_t bytes = rdbSavedObjectLen(o,fp);
3149
3150 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3151 }
3152
3153 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3154 static int rdbSave(char *filename) {
3155 dictIterator *di = NULL;
3156 dictEntry *de;
3157 FILE *fp;
3158 char tmpfile[256];
3159 int j;
3160 time_t now = time(NULL);
3161
3162 /* Wait for I/O therads to terminate, just in case this is a
3163 * foreground-saving, to avoid seeking the swap file descriptor at the
3164 * same time. */
3165 if (server.vm_enabled)
3166 waitEmptyIOJobsQueue();
3167
3168 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3169 fp = fopen(tmpfile,"w");
3170 if (!fp) {
3171 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3172 return REDIS_ERR;
3173 }
3174 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3175 for (j = 0; j < server.dbnum; j++) {
3176 redisDb *db = server.db+j;
3177 dict *d = db->dict;
3178 if (dictSize(d) == 0) continue;
3179 di = dictGetIterator(d);
3180 if (!di) {
3181 fclose(fp);
3182 return REDIS_ERR;
3183 }
3184
3185 /* Write the SELECT DB opcode */
3186 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3187 if (rdbSaveLen(fp,j) == -1) goto werr;
3188
3189 /* Iterate this DB writing every entry */
3190 while((de = dictNext(di)) != NULL) {
3191 robj *key = dictGetEntryKey(de);
3192 robj *o = dictGetEntryVal(de);
3193 time_t expiretime = getExpire(db,key);
3194
3195 /* Save the expire time */
3196 if (expiretime != -1) {
3197 /* If this key is already expired skip it */
3198 if (expiretime < now) continue;
3199 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3200 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3201 }
3202 /* Save the key and associated value. This requires special
3203 * handling if the value is swapped out. */
3204 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3205 key->storage == REDIS_VM_SWAPPING) {
3206 /* Save type, key, value */
3207 if (rdbSaveType(fp,o->type) == -1) goto werr;
3208 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3209 if (rdbSaveObject(fp,o) == -1) goto werr;
3210 } else {
3211 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3212 robj *po;
3213 /* Get a preview of the object in memory */
3214 po = vmPreviewObject(key);
3215 /* Save type, key, value */
3216 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3217 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3218 if (rdbSaveObject(fp,po) == -1) goto werr;
3219 /* Remove the loaded object from memory */
3220 decrRefCount(po);
3221 }
3222 }
3223 dictReleaseIterator(di);
3224 }
3225 /* EOF opcode */
3226 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3227
3228 /* Make sure data will not remain on the OS's output buffers */
3229 fflush(fp);
3230 fsync(fileno(fp));
3231 fclose(fp);
3232
3233 /* Use RENAME to make sure the DB file is changed atomically only
3234 * if the generate DB file is ok. */
3235 if (rename(tmpfile,filename) == -1) {
3236 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3237 unlink(tmpfile);
3238 return REDIS_ERR;
3239 }
3240 redisLog(REDIS_NOTICE,"DB saved on disk");
3241 server.dirty = 0;
3242 server.lastsave = time(NULL);
3243 return REDIS_OK;
3244
3245 werr:
3246 fclose(fp);
3247 unlink(tmpfile);
3248 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3249 if (di) dictReleaseIterator(di);
3250 return REDIS_ERR;
3251 }
3252
3253 static int rdbSaveBackground(char *filename) {
3254 pid_t childpid;
3255
3256 if (server.bgsavechildpid != -1) return REDIS_ERR;
3257 if (server.vm_enabled) waitEmptyIOJobsQueue();
3258 if ((childpid = fork()) == 0) {
3259 /* Child */
3260 if (server.vm_enabled) vmReopenSwapFile();
3261 close(server.fd);
3262 if (rdbSave(filename) == REDIS_OK) {
3263 _exit(0);
3264 } else {
3265 _exit(1);
3266 }
3267 } else {
3268 /* Parent */
3269 if (childpid == -1) {
3270 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3271 strerror(errno));
3272 return REDIS_ERR;
3273 }
3274 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3275 server.bgsavechildpid = childpid;
3276 return REDIS_OK;
3277 }
3278 return REDIS_OK; /* unreached */
3279 }
3280
/* Remove the temporary RDB file named after the pid 'childpid', that is
 * the name rdbSave() uses when running in the process with that pid. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3287
/* Read a single type byte from 'fp'.
 * Returns the byte (0-255), or -1 if it could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 1) ? type : -1;
}
3293
/* Read a time stored by rdbSaveTime(): a 32 bit signed integer in the
 * byte order of the host. Returns -1 if the value could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) != 1) return -1;
    return (time_t) stamp;
}
3299
3300 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3301 * of this file for a description of how this are stored on disk.
3302 *
3303 * isencoded is set to 1 if the readed length is not actually a length but
3304 * an "encoding type", check the above comments for more info */
3305 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3306 unsigned char buf[2];
3307 uint32_t len;
3308 int type;
3309
3310 if (isencoded) *isencoded = 0;
3311 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3312 type = (buf[0]&0xC0)>>6;
3313 if (type == REDIS_RDB_6BITLEN) {
3314 /* Read a 6 bit len */
3315 return buf[0]&0x3F;
3316 } else if (type == REDIS_RDB_ENCVAL) {
3317 /* Read a 6 bit len encoding type */
3318 if (isencoded) *isencoded = 1;
3319 return buf[0]&0x3F;
3320 } else if (type == REDIS_RDB_14BITLEN) {
3321 /* Read a 14 bit len */
3322 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3323 return ((buf[0]&0x3F)<<8)|buf[1];
3324 } else {
3325 /* Read a 32 bit len */
3326 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3327 return ntohl(len);
3328 }
3329 }
3330
3331 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3332 unsigned char enc[4];
3333 long long val;
3334
3335 if (enctype == REDIS_RDB_ENC_INT8) {
3336 if (fread(enc,1,1,fp) == 0) return NULL;
3337 val = (signed char)enc[0];
3338 } else if (enctype == REDIS_RDB_ENC_INT16) {
3339 uint16_t v;
3340 if (fread(enc,2,1,fp) == 0) return NULL;
3341 v = enc[0]|(enc[1]<<8);
3342 val = (int16_t)v;
3343 } else if (enctype == REDIS_RDB_ENC_INT32) {
3344 uint32_t v;
3345 if (fread(enc,4,1,fp) == 0) return NULL;
3346 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3347 val = (int32_t)v;
3348 } else {
3349 val = 0; /* anti-warning */
3350 redisAssert(0!=0);
3351 }
3352 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3353 }
3354
3355 static robj *rdbLoadLzfStringObject(FILE*fp) {
3356 unsigned int len, clen;
3357 unsigned char *c = NULL;
3358 sds val = NULL;
3359
3360 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3361 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3362 if ((c = zmalloc(clen)) == NULL) goto err;
3363 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3364 if (fread(c,clen,1,fp) == 0) goto err;
3365 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3366 zfree(c);
3367 return createObject(REDIS_STRING,val);
3368 err:
3369 zfree(c);
3370 sdsfree(val);
3371 return NULL;
3372 }
3373
3374 static robj *rdbLoadStringObject(FILE*fp) {
3375 int isencoded;
3376 uint32_t len;
3377 sds val;
3378
3379 len = rdbLoadLen(fp,&isencoded);
3380 if (isencoded) {
3381 switch(len) {
3382 case REDIS_RDB_ENC_INT8:
3383 case REDIS_RDB_ENC_INT16:
3384 case REDIS_RDB_ENC_INT32:
3385 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3386 case REDIS_RDB_ENC_LZF:
3387 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3388 default:
3389 redisAssert(0!=0);
3390 }
3391 }
3392
3393 if (len == REDIS_RDB_LENERR) return NULL;
3394 val = sdsnewlen(NULL,len);
3395 if (len && fread(val,len,1,fp) == 0) {
3396 sdsfree(val);
3397 return NULL;
3398 }
3399 return tryObjectSharing(createObject(REDIS_STRING,val));
3400 }
3401
3402 /* For information about double serialization check rdbSaveDoubleValue() */
3403 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3404 char buf[128];
3405 unsigned char len;
3406
3407 if (fread(&len,1,1,fp) == 0) return -1;
3408 switch(len) {
3409 case 255: *val = R_NegInf; return 0;
3410 case 254: *val = R_PosInf; return 0;
3411 case 253: *val = R_Nan; return 0;
3412 default:
3413 if (fread(buf,len,1,fp) == 0) return -1;
3414 buf[len] = '\0';
3415 sscanf(buf, "%lg", val);
3416 return 0;
3417 }
3418 }
3419
3420 /* Load a Redis object of the specified type from the specified file.
3421 * On success a newly allocated object is returned, otherwise NULL. */
3422 static robj *rdbLoadObject(int type, FILE *fp) {
3423 robj *o;
3424
3425 if (type == REDIS_STRING) {
3426 /* Read string value */
3427 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3428 tryObjectEncoding(o);
3429 } else if (type == REDIS_LIST || type == REDIS_SET) {
3430 /* Read list/set value */
3431 uint32_t listlen;
3432
3433 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3434 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3435 /* It's faster to expand the dict to the right size asap in order
3436 * to avoid rehashing */
3437 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3438 dictExpand(o->ptr,listlen);
3439 /* Load every single element of the list/set */
3440 while(listlen--) {
3441 robj *ele;
3442
3443 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3444 tryObjectEncoding(ele);
3445 if (type == REDIS_LIST) {
3446 listAddNodeTail((list*)o->ptr,ele);
3447 } else {
3448 dictAdd((dict*)o->ptr,ele,NULL);
3449 }
3450 }
3451 } else if (type == REDIS_ZSET) {
3452 /* Read list/set value */
3453 uint32_t zsetlen;
3454 zset *zs;
3455
3456 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3457 o = createZsetObject();
3458 zs = o->ptr;
3459 /* Load every single element of the list/set */
3460 while(zsetlen--) {
3461 robj *ele;
3462 double *score = zmalloc(sizeof(double));
3463
3464 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3465 tryObjectEncoding(ele);
3466 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3467 dictAdd(zs->dict,ele,score);
3468 zslInsert(zs->zsl,*score,ele);
3469 incrRefCount(ele); /* added to skiplist */
3470 }
3471 } else {
3472 redisAssert(0 != 0);
3473 }
3474 return o;
3475 }
3476
3477 static int rdbLoad(char *filename) {
3478 FILE *fp;
3479 robj *keyobj = NULL;
3480 uint32_t dbid;
3481 int type, retval, rdbver;
3482 dict *d = server.db[0].dict;
3483 redisDb *db = server.db+0;
3484 char buf[1024];
3485 time_t expiretime = -1, now = time(NULL);
3486 long long loadedkeys = 0;
3487
3488 fp = fopen(filename,"r");
3489 if (!fp) return REDIS_ERR;
3490 if (fread(buf,9,1,fp) == 0) goto eoferr;
3491 buf[9] = '\0';
3492 if (memcmp(buf,"REDIS",5) != 0) {
3493 fclose(fp);
3494 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3495 return REDIS_ERR;
3496 }
3497 rdbver = atoi(buf+5);
3498 if (rdbver != 1) {
3499 fclose(fp);
3500 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3501 return REDIS_ERR;
3502 }
3503 while(1) {
3504 robj *o;
3505
3506 /* Read type. */
3507 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3508 if (type == REDIS_EXPIRETIME) {
3509 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3510 /* We read the time so we need to read the object type again */
3511 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3512 }
3513 if (type == REDIS_EOF) break;
3514 /* Handle SELECT DB opcode as a special case */
3515 if (type == REDIS_SELECTDB) {
3516 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3517 goto eoferr;
3518 if (dbid >= (unsigned)server.dbnum) {
3519 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3520 exit(1);
3521 }
3522 db = server.db+dbid;
3523 d = db->dict;
3524 continue;
3525 }
3526 /* Read key */
3527 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3528 /* Read value */
3529 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3530 /* Add the new object in the hash table */
3531 retval = dictAdd(d,keyobj,o);
3532 if (retval == DICT_ERR) {
3533 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3534 exit(1);
3535 }
3536 /* Set the expire time if needed */
3537 if (expiretime != -1) {
3538 setExpire(db,keyobj,expiretime);
3539 /* Delete this key if already expired */
3540 if (expiretime < now) deleteKey(db,keyobj);
3541 expiretime = -1;
3542 }
3543 keyobj = o = NULL;
3544 /* Handle swapping while loading big datasets when VM is on */
3545 loadedkeys++;
3546 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3547 while (zmalloc_used_memory() > server.vm_max_memory) {
3548 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3549 }
3550 }
3551 }
3552 fclose(fp);
3553 return REDIS_OK;
3554
3555 eoferr: /* unexpected end of file is handled here with a fatal exit */
3556 if (keyobj) decrRefCount(keyobj);
3557 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3558 exit(1);
3559 return REDIS_ERR; /* Just to avoid warning */
3560 }
3561
3562 /*================================== Commands =============================== */
3563
3564 static void authCommand(redisClient *c) {
3565 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3566 c->authenticated = 1;
3567 addReply(c,shared.ok);
3568 } else {
3569 c->authenticated = 0;
3570 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3571 }
3572 }
3573
3574 static void pingCommand(redisClient *c) {
3575 addReply(c,shared.pong);
3576 }
3577
3578 static void echoCommand(redisClient *c) {
3579 addReplyBulkLen(c,c->argv[1]);
3580 addReply(c,c->argv[1]);
3581 addReply(c,shared.crlf);
3582 }
3583
3584 /*=================================== Strings =============================== */
3585
3586 static void setGenericCommand(redisClient *c, int nx) {
3587 int retval;
3588
3589 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3590 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3591 if (retval == DICT_ERR) {
3592 if (!nx) {
3593 /* If the key is about a swapped value, we want a new key object
3594 * to overwrite the old. So we delete the old key in the database.
3595 * This will also make sure that swap pages about the old object
3596 * will be marked as free. */
3597 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3598 incrRefCount(c->argv[1]);
3599 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3600 incrRefCount(c->argv[2]);
3601 } else {
3602 addReply(c,shared.czero);
3603 return;
3604 }
3605 } else {
3606 incrRefCount(c->argv[1]);
3607 incrRefCount(c->argv[2]);
3608 }
3609 server.dirty++;
3610 removeExpire(c->db,c->argv[1]);
3611 addReply(c, nx ? shared.cone : shared.ok);
3612 }
3613
3614 static void setCommand(redisClient *c) {
3615 setGenericCommand(c,0);
3616 }
3617
3618 static void setnxCommand(redisClient *c) {
3619 setGenericCommand(c,1);
3620 }
3621
3622 static int getGenericCommand(redisClient *c) {
3623 robj *o = lookupKeyRead(c->db,c->argv[1]);
3624
3625 if (o == NULL) {
3626 addReply(c,shared.nullbulk);
3627 return REDIS_OK;
3628 } else {
3629 if (o->type != REDIS_STRING) {
3630 addReply(c,shared.wrongtypeerr);
3631 return REDIS_ERR;
3632 } else {
3633 addReplyBulkLen(c,o);
3634 addReply(c,o);
3635 addReply(c,shared.crlf);
3636 return REDIS_OK;
3637 }
3638 }
3639 }
3640
3641 static void getCommand(redisClient *c) {
3642 getGenericCommand(c);
3643 }
3644
3645 static void getsetCommand(redisClient *c) {
3646 if (getGenericCommand(c) == REDIS_ERR) return;
3647 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3648 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3649 } else {
3650 incrRefCount(c->argv[1]);
3651 }
3652 incrRefCount(c->argv[2]);
3653 server.dirty++;
3654 removeExpire(c->db,c->argv[1]);
3655 }
3656
3657 static void mgetCommand(redisClient *c) {
3658 int j;
3659
3660 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3661 for (j = 1; j < c->argc; j++) {
3662 robj *o = lookupKeyRead(c->db,c->argv[j]);
3663 if (o == NULL) {
3664 addReply(c,shared.nullbulk);
3665 } else {
3666 if (o->type != REDIS_STRING) {
3667 addReply(c,shared.nullbulk);
3668 } else {
3669 addReplyBulkLen(c,o);
3670 addReply(c,o);
3671 addReply(c,shared.crlf);
3672 }
3673 }
3674 }
3675 }
3676
3677 static void msetGenericCommand(redisClient *c, int nx) {
3678 int j, busykeys = 0;
3679
3680 if ((c->argc % 2) == 0) {
3681 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3682 return;
3683 }
3684 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3685 * set nothing at all if at least one already key exists. */
3686 if (nx) {
3687 for (j = 1; j < c->argc; j += 2) {
3688 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3689 busykeys++;
3690 }
3691 }
3692 }
3693 if (busykeys) {
3694 addReply(c, shared.czero);
3695 return;
3696 }
3697
3698 for (j = 1; j < c->argc; j += 2) {
3699 int retval;
3700
3701 tryObjectEncoding(c->argv[j+1]);
3702 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3703 if (retval == DICT_ERR) {
3704 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3705 incrRefCount(c->argv[j+1]);
3706 } else {
3707 incrRefCount(c->argv[j]);
3708 incrRefCount(c->argv[j+1]);
3709 }
3710 removeExpire(c->db,c->argv[j]);
3711 }
3712 server.dirty += (c->argc-1)/2;
3713 addReply(c, nx ? shared.cone : shared.ok);
3714 }
3715
3716 static void msetCommand(redisClient *c) {
3717 msetGenericCommand(c,0);
3718 }
3719
3720 static void msetnxCommand(redisClient *c) {
3721 msetGenericCommand(c,1);
3722 }
3723
3724 static void incrDecrCommand(redisClient *c, long long incr) {
3725 long long value;
3726 int retval;
3727 robj *o;
3728
3729 o = lookupKeyWrite(c->db,c->argv[1]);
3730 if (o == NULL) {
3731 value = 0;
3732 } else {
3733 if (o->type != REDIS_STRING) {
3734 value = 0;
3735 } else {
3736 char *eptr;
3737
3738 if (o->encoding == REDIS_ENCODING_RAW)
3739 value = strtoll(o->ptr, &eptr, 10);
3740 else if (o->encoding == REDIS_ENCODING_INT)
3741 value = (long)o->ptr;
3742 else
3743 redisAssert(1 != 1);
3744 }
3745 }
3746
3747 value += incr;
3748 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3749 tryObjectEncoding(o);
3750 retval = dictAdd(c->db->dict,c->argv[1],o);
3751 if (retval == DICT_ERR) {
3752 dictReplace(c->db->dict,c->argv[1],o);
3753 removeExpire(c->db,c->argv[1]);
3754 } else {
3755 incrRefCount(c->argv[1]);
3756 }
3757 server.dirty++;
3758 addReply(c,shared.colon);
3759 addReply(c,o);
3760 addReply(c,shared.crlf);
3761 }
3762
3763 static void incrCommand(redisClient *c) {
3764 incrDecrCommand(c,1);
3765 }
3766
3767 static void decrCommand(redisClient *c) {
3768 incrDecrCommand(c,-1);
3769 }
3770
3771 static void incrbyCommand(redisClient *c) {
3772 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3773 incrDecrCommand(c,incr);
3774 }
3775
3776 static void decrbyCommand(redisClient *c) {
3777 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3778 incrDecrCommand(c,-incr);
3779 }
3780
3781 static void appendCommand(redisClient *c) {
3782 int retval;
3783 size_t totlen;
3784 robj *o;
3785
3786 o = lookupKeyWrite(c->db,c->argv[1]);
3787 if (o == NULL) {
3788 /* Create the key */
3789 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3790 incrRefCount(c->argv[1]);
3791 incrRefCount(c->argv[2]);
3792 totlen = stringObjectLen(c->argv[2]);
3793 } else {
3794 dictEntry *de;
3795
3796 de = dictFind(c->db->dict,c->argv[1]);
3797 assert(de != NULL);
3798
3799 o = dictGetEntryVal(de);
3800 if (o->type != REDIS_STRING) {
3801 addReply(c,shared.wrongtypeerr);
3802 return;
3803 }
3804 /* If the object is specially encoded or shared we have to make
3805 * a copy */
3806 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3807 robj *decoded = getDecodedObject(o);
3808
3809 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3810 decrRefCount(decoded);
3811 dictReplace(c->db->dict,c->argv[1],o);
3812 }
3813 /* APPEND! */
3814 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3815 o->ptr = sdscatlen(o->ptr,
3816 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3817 } else {
3818 o->ptr = sdscatprintf(o->ptr, "%ld",
3819 (unsigned long) c->argv[2]->ptr);
3820 }
3821 totlen = sdslen(o->ptr);
3822 }
3823 server.dirty++;
3824 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3825 }
3826
3827 static void substrCommand(redisClient *c) {
3828 robj *o;
3829 long start = atoi(c->argv[2]->ptr);
3830 long end = atoi(c->argv[3]->ptr);
3831
3832 o = lookupKeyRead(c->db,c->argv[1]);
3833 if (o == NULL) {
3834 addReply(c,shared.nullbulk);
3835 } else {
3836 if (o->type != REDIS_STRING) {
3837 addReply(c,shared.wrongtypeerr);
3838 } else {
3839 size_t rangelen, strlen;
3840 sds range;
3841
3842 o = getDecodedObject(o);
3843 strlen = sdslen(o->ptr);
3844
3845 /* convert negative indexes */
3846 if (start < 0) start = strlen+start;
3847 if (end < 0) end = strlen+end;
3848 if (start < 0) start = 0;
3849 if (end < 0) end = 0;
3850
3851 /* indexes sanity checks */
3852 if (start > end || (size_t)start >= strlen) {
3853 /* Out of range start or start > end result in null reply */
3854 addReply(c,shared.nullbulk);
3855 decrRefCount(o);
3856 return;
3857 }
3858 if ((size_t)end >= strlen) end = strlen-1;
3859 rangelen = (end-start)+1;
3860
3861 /* Return the result */
3862 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen));
3863 range = sdsnewlen((char*)o->ptr+start,rangelen);
3864 addReplySds(c,range);
3865 addReply(c,shared.crlf);
3866 decrRefCount(o);
3867 }
3868 }
3869 }
3870
3871 /* ========================= Type agnostic commands ========================= */
3872
3873 static void delCommand(redisClient *c) {
3874 int deleted = 0, j;
3875
3876 for (j = 1; j < c->argc; j++) {
3877 if (deleteKey(c->db,c->argv[j])) {
3878 server.dirty++;
3879 deleted++;
3880 }
3881 }
3882 switch(deleted) {
3883 case 0:
3884 addReply(c,shared.czero);
3885 break;
3886 case 1:
3887 addReply(c,shared.cone);
3888 break;
3889 default:
3890 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3891 break;
3892 }
3893 }
3894
3895 static void existsCommand(redisClient *c) {
3896 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3897 }
3898
3899 static void selectCommand(redisClient *c) {
3900 int id = atoi(c->argv[1]->ptr);
3901
3902 if (selectDb(c,id) == REDIS_ERR) {
3903 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3904 } else {
3905 addReply(c,shared.ok);
3906 }
3907 }
3908
3909 static void randomkeyCommand(redisClient *c) {
3910 dictEntry *de;
3911
3912 while(1) {
3913 de = dictGetRandomKey(c->db->dict);
3914 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3915 }
3916 if (de == NULL) {
3917 addReply(c,shared.plus);
3918 addReply(c,shared.crlf);
3919 } else {
3920 addReply(c,shared.plus);
3921 addReply(c,dictGetEntryKey(de));
3922 addReply(c,shared.crlf);
3923 }
3924 }
3925
3926 static void keysCommand(redisClient *c) {
3927 dictIterator *di;
3928 dictEntry *de;
3929 sds pattern = c->argv[1]->ptr;
3930 int plen = sdslen(pattern);
3931 unsigned long numkeys = 0;
3932 robj *lenobj = createObject(REDIS_STRING,NULL);
3933
3934 di = dictGetIterator(c->db->dict);
3935 addReply(c,lenobj);
3936 decrRefCount(lenobj);
3937 while((de = dictNext(di)) != NULL) {
3938 robj *keyobj = dictGetEntryKey(de);
3939
3940 sds key = keyobj->ptr;
3941 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3942 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3943 if (expireIfNeeded(c->db,keyobj) == 0) {
3944 addReplyBulkLen(c,keyobj);
3945 addReply(c,keyobj);
3946 addReply(c,shared.crlf);
3947 numkeys++;
3948 }
3949 }
3950 }
3951 dictReleaseIterator(di);
3952 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3953 }
3954
3955 static void dbsizeCommand(redisClient *c) {
3956 addReplySds(c,
3957 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3958 }
3959
3960 static void lastsaveCommand(redisClient *c) {
3961 addReplySds(c,
3962 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3963 }
3964
3965 static void typeCommand(redisClient *c) {
3966 robj *o;
3967 char *type;
3968
3969 o = lookupKeyRead(c->db,c->argv[1]);
3970 if (o == NULL) {
3971 type = "+none";
3972 } else {
3973 switch(o->type) {
3974 case REDIS_STRING: type = "+string"; break;
3975 case REDIS_LIST: type = "+list"; break;
3976 case REDIS_SET: type = "+set"; break;
3977 case REDIS_ZSET: type = "+zset"; break;
3978 default: type = "unknown"; break;
3979 }
3980 }
3981 addReplySds(c,sdsnew(type));
3982 addReply(c,shared.crlf);
3983 }
3984
3985 static void saveCommand(redisClient *c) {
3986 if (server.bgsavechildpid != -1) {
3987 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3988 return;
3989 }
3990 if (rdbSave(server.dbfilename) == REDIS_OK) {
3991 addReply(c,shared.ok);
3992 } else {
3993 addReply(c,shared.err);
3994 }
3995 }
3996
3997 static void bgsaveCommand(redisClient *c) {
3998 if (server.bgsavechildpid != -1) {
3999 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4000 return;
4001 }
4002 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4003 char *status = "+Background saving started\r\n";
4004 addReplySds(c,sdsnew(status));
4005 } else {
4006 addReply(c,shared.err);
4007 }
4008 }
4009
4010 static void shutdownCommand(redisClient *c) {
4011 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4012 /* Kill the saving child if there is a background saving in progress.
4013 We want to avoid race conditions, for instance our saving child may
4014 overwrite the synchronous saving did by SHUTDOWN. */
4015 if (server.bgsavechildpid != -1) {
4016 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4017 kill(server.bgsavechildpid,SIGKILL);
4018 rdbRemoveTempFile(server.bgsavechildpid);
4019 }
4020 if (server.appendonly) {
4021 /* Append only file: fsync() the AOF and exit */
4022 fsync(server.appendfd);
4023 if (server.vm_enabled) unlink(server.vm_swap_file);
4024 exit(0);
4025 } else {
4026 /* Snapshotting. Perform a SYNC SAVE and exit */
4027 if (rdbSave(server.dbfilename) == REDIS_OK) {
4028 if (server.daemonize)
4029 unlink(server.pidfile);
4030 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4031 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4032 if (server.vm_enabled) unlink(server.vm_swap_file);
4033 exit(0);
4034 } else {
4035 /* Ooops.. error saving! The best we can do is to continue operating.
4036 * Note that if there was a background saving process, in the next
4037 * cron() Redis will be notified that the background saving aborted,
4038 * handling special stuff like slaves pending for synchronization... */
4039 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4040 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4041 }
4042 }
4043 }
4044
4045 static void renameGenericCommand(redisClient *c, int nx) {
4046 robj *o;
4047
4048 /* To use the same key as src and dst is probably an error */
4049 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4050 addReply(c,shared.sameobjecterr);
4051 return;
4052 }
4053
4054 o = lookupKeyWrite(c->db,c->argv[1]);
4055 if (o == NULL) {
4056 addReply(c,shared.nokeyerr);
4057 return;
4058 }
4059 incrRefCount(o);
4060 deleteIfVolatile(c->db,c->argv[2]);
4061 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4062 if (nx) {
4063 decrRefCount(o);
4064 addReply(c,shared.czero);
4065 return;
4066 }
4067 dictReplace(c->db->dict,c->argv[2],o);
4068 } else {
4069 incrRefCount(c->argv[2]);
4070 }
4071 deleteKey(c->db,c->argv[1]);
4072 server.dirty++;
4073 addReply(c,nx ? shared.cone : shared.ok);
4074 }
4075
4076 static void renameCommand(redisClient *c) {
4077 renameGenericCommand(c,0);
4078 }
4079
4080 static void renamenxCommand(redisClient *c) {
4081 renameGenericCommand(c,1);
4082 }
4083
4084 static void moveCommand(redisClient *c) {
4085 robj *o;
4086 redisDb *src, *dst;
4087 int srcid;
4088
4089 /* Obtain source and target DB pointers */
4090 src = c->db;
4091 srcid = c->db->id;
4092 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4093 addReply(c,shared.outofrangeerr);
4094 return;
4095 }
4096 dst = c->db;
4097 selectDb(c,srcid); /* Back to the source DB */
4098
4099 /* If the user is moving using as target the same
4100 * DB as the source DB it is probably an error. */
4101 if (src == dst) {
4102 addReply(c,shared.sameobjecterr);
4103 return;
4104 }
4105
4106 /* Check if the element exists and get a reference */
4107 o = lookupKeyWrite(c->db,c->argv[1]);
4108 if (!o) {
4109 addReply(c,shared.czero);
4110 return;
4111 }
4112
4113 /* Try to add the element to the target DB */
4114 deleteIfVolatile(dst,c->argv[1]);
4115 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4116 addReply(c,shared.czero);
4117 return;
4118 }
4119 incrRefCount(c->argv[1]);
4120 incrRefCount(o);
4121
4122 /* OK! key moved, free the entry in the source DB */
4123 deleteKey(src,c->argv[1]);
4124 server.dirty++;
4125 addReply(c,shared.cone);
4126 }
4127
4128 /* =================================== Lists ================================ */
4129 static void pushGenericCommand(redisClient *c, int where) {
4130 robj *lobj;
4131 list *list;
4132
4133 lobj = lookupKeyWrite(c->db,c->argv[1]);
4134 if (lobj == NULL) {
4135 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4136 addReply(c,shared.cone);
4137 return;
4138 }
4139 lobj = createListObject();
4140 list = lobj->ptr;
4141 if (where == REDIS_HEAD) {
4142 listAddNodeHead(list,c->argv[2]);
4143 } else {
4144 listAddNodeTail(list,c->argv[2]);
4145 }
4146 dictAdd(c->db->dict,c->argv[1],lobj);
4147 incrRefCount(c->argv[1]);
4148 incrRefCount(c->argv[2]);
4149 } else {
4150 if (lobj->type != REDIS_LIST) {
4151 addReply(c,shared.wrongtypeerr);
4152 return;
4153 }
4154 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4155 addReply(c,shared.cone);
4156 return;
4157 }
4158 list = lobj->ptr;
4159 if (where == REDIS_HEAD) {
4160 listAddNodeHead(list,c->argv[2]);
4161 } else {
4162 listAddNodeTail(list,c->argv[2]);
4163 }
4164 incrRefCount(c->argv[2]);
4165 }
4166 server.dirty++;
4167 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4168 }
4169
4170 static void lpushCommand(redisClient *c) {
4171 pushGenericCommand(c,REDIS_HEAD);
4172 }
4173
4174 static void rpushCommand(redisClient *c) {
4175 pushGenericCommand(c,REDIS_TAIL);
4176 }
4177
4178 static void llenCommand(redisClient *c) {
4179 robj *o;
4180 list *l;
4181
4182 o = lookupKeyRead(c->db,c->argv[1]);
4183 if (o == NULL) {
4184 addReply(c,shared.czero);
4185 return;
4186 } else {
4187 if (o->type != REDIS_LIST) {
4188 addReply(c,shared.wrongtypeerr);
4189 } else {
4190 l = o->ptr;
4191 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4192 }
4193 }
4194 }
4195
4196 static void lindexCommand(redisClient *c) {
4197 robj *o;
4198 int index = atoi(c->argv[2]->ptr);
4199
4200 o = lookupKeyRead(c->db,c->argv[1]);
4201 if (o == NULL) {
4202 addReply(c,shared.nullbulk);
4203 } else {
4204 if (o->type != REDIS_LIST) {
4205 addReply(c,shared.wrongtypeerr);
4206 } else {
4207 list *list = o->ptr;
4208 listNode *ln;
4209
4210 ln = listIndex(list, index);
4211 if (ln == NULL) {
4212 addReply(c,shared.nullbulk);
4213 } else {
4214 robj *ele = listNodeValue(ln);
4215 addReplyBulkLen(c,ele);
4216 addReply(c,ele);
4217 addReply(c,shared.crlf);
4218 }
4219 }
4220 }
4221 }
4222
4223 static void lsetCommand(redisClient *c) {
4224 robj *o;
4225 int index = atoi(c->argv[2]->ptr);
4226
4227 o = lookupKeyWrite(c->db,c->argv[1]);
4228 if (o == NULL) {
4229 addReply(c,shared.nokeyerr);
4230 } else {
4231 if (o->type != REDIS_LIST) {
4232 addReply(c,shared.wrongtypeerr);
4233 } else {
4234 list *list = o->ptr;
4235 listNode *ln;
4236
4237 ln = listIndex(list, index);
4238 if (ln == NULL) {
4239 addReply(c,shared.outofrangeerr);
4240 } else {
4241 robj *ele = listNodeValue(ln);
4242
4243 decrRefCount(ele);
4244 listNodeValue(ln) = c->argv[3];
4245 incrRefCount(c->argv[3]);
4246 addReply(c,shared.ok);
4247 server.dirty++;
4248 }
4249 }
4250 }
4251 }
4252
4253 static void popGenericCommand(redisClient *c, int where) {
4254 robj *o;
4255
4256 o = lookupKeyWrite(c->db,c->argv[1]);
4257 if (o == NULL) {
4258 addReply(c,shared.nullbulk);
4259 } else {
4260 if (o->type != REDIS_LIST) {
4261 addReply(c,shared.wrongtypeerr);
4262 } else {
4263 list *list = o->ptr;
4264 listNode *ln;
4265
4266 if (where == REDIS_HEAD)
4267 ln = listFirst(list);
4268 else
4269 ln = listLast(list);
4270
4271 if (ln == NULL) {
4272 addReply(c,shared.nullbulk);
4273 } else {
4274 robj *ele = listNodeValue(ln);
4275 addReplyBulkLen(c,ele);
4276 addReply(c,ele);
4277 addReply(c,shared.crlf);
4278 listDelNode(list,ln);
4279 server.dirty++;
4280 }
4281 }
4282 }
4283 }
4284
4285 static void lpopCommand(redisClient *c) {
4286 popGenericCommand(c,REDIS_HEAD);
4287 }
4288
4289 static void rpopCommand(redisClient *c) {
4290 popGenericCommand(c,REDIS_TAIL);
4291 }
4292
4293 static void lrangeCommand(redisClient *c) {
4294 robj *o;
4295 int start = atoi(c->argv[2]->ptr);
4296 int end = atoi(c->argv[3]->ptr);
4297
4298 o = lookupKeyRead(c->db,c->argv[1]);
4299 if (o == NULL) {
4300 addReply(c,shared.nullmultibulk);
4301 } else {
4302 if (o->type != REDIS_LIST) {
4303 addReply(c,shared.wrongtypeerr);
4304 } else {
4305 list *list = o->ptr;
4306 listNode *ln;
4307 int llen = listLength(list);
4308 int rangelen, j;
4309 robj *ele;
4310
4311 /* convert negative indexes */
4312 if (start < 0) start = llen+start;
4313 if (end < 0) end = llen+end;
4314 if (start < 0) start = 0;
4315 if (end < 0) end = 0;
4316
4317 /* indexes sanity checks */
4318 if (start > end || start >= llen) {
4319 /* Out of range start or start > end result in empty list */
4320 addReply(c,shared.emptymultibulk);
4321 return;
4322 }
4323 if (end >= llen) end = llen-1;
4324 rangelen = (end-start)+1;
4325
4326 /* Return the result in form of a multi-bulk reply */
4327 ln = listIndex(list, start);
4328 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4329 for (j = 0; j < rangelen; j++) {
4330 ele = listNodeValue(ln);
4331 addReplyBulkLen(c,ele);
4332 addReply(c,ele);
4333 addReply(c,shared.crlf);
4334 ln = ln->next;
4335 }
4336 }
4337 }
4338 }
4339
4340 static void ltrimCommand(redisClient *c) {
4341 robj *o;
4342 int start = atoi(c->argv[2]->ptr);
4343 int end = atoi(c->argv[3]->ptr);
4344
4345 o = lookupKeyWrite(c->db,c->argv[1]);
4346 if (o == NULL) {
4347 addReply(c,shared.ok);
4348 } else {
4349 if (o->type != REDIS_LIST) {
4350 addReply(c,shared.wrongtypeerr);
4351 } else {
4352 list *list = o->ptr;
4353 listNode *ln;
4354 int llen = listLength(list);
4355 int j, ltrim, rtrim;
4356
4357 /* convert negative indexes */
4358 if (start < 0) start = llen+start;
4359 if (end < 0) end = llen+end;
4360 if (start < 0) start = 0;
4361 if (end < 0) end = 0;
4362
4363 /* indexes sanity checks */
4364 if (start > end || start >= llen) {
4365 /* Out of range start or start > end result in empty list */
4366 ltrim = llen;
4367 rtrim = 0;
4368 } else {
4369 if (end >= llen) end = llen-1;
4370 ltrim = start;
4371 rtrim = llen-end-1;
4372 }
4373
4374 /* Remove list elements to perform the trim */
4375 for (j = 0; j < ltrim; j++) {
4376 ln = listFirst(list);
4377 listDelNode(list,ln);
4378 }
4379 for (j = 0; j < rtrim; j++) {
4380 ln = listLast(list);
4381 listDelNode(list,ln);
4382 }
4383 server.dirty++;
4384 addReply(c,shared.ok);
4385 }
4386 }
4387 }
4388
4389 static void lremCommand(redisClient *c) {
4390 robj *o;
4391
4392 o = lookupKeyWrite(c->db,c->argv[1]);
4393 if (o == NULL) {
4394 addReply(c,shared.czero);
4395 } else {
4396 if (o->type != REDIS_LIST) {
4397 addReply(c,shared.wrongtypeerr);
4398 } else {
4399 list *list = o->ptr;
4400 listNode *ln, *next;
4401 int toremove = atoi(c->argv[2]->ptr);
4402 int removed = 0;
4403 int fromtail = 0;
4404
4405 if (toremove < 0) {
4406 toremove = -toremove;
4407 fromtail = 1;
4408 }
4409 ln = fromtail ? list->tail : list->head;
4410 while (ln) {
4411 robj *ele = listNodeValue(ln);
4412
4413 next = fromtail ? ln->prev : ln->next;
4414 if (compareStringObjects(ele,c->argv[3]) == 0) {
4415 listDelNode(list,ln);
4416 server.dirty++;
4417 removed++;
4418 if (toremove && removed == toremove) break;
4419 }
4420 ln = next;
4421 }
4422 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4423 }
4424 }
4425 }
4426
4427 /* This is the semantic of this command:
4428 * RPOPLPUSH srclist dstlist:
4429 * IF LLEN(srclist) > 0
4430 * element = RPOP srclist
4431 * LPUSH dstlist element
4432 * RETURN element
4433 * ELSE
4434 * RETURN nil
4435 * END
4436 * END
4437 *
4438 * The idea is to be able to get an element from a list in a reliable way
4439 * since the element is not just returned but pushed against another list
4440 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4441 */
4442 static void rpoplpushcommand(redisClient *c) {
4443 robj *sobj;
4444
4445 sobj = lookupKeyWrite(c->db,c->argv[1]);
4446 if (sobj == NULL) {
4447 addReply(c,shared.nullbulk);
4448 } else {
4449 if (sobj->type != REDIS_LIST) {
4450 addReply(c,shared.wrongtypeerr);
4451 } else {
4452 list *srclist = sobj->ptr;
4453 listNode *ln = listLast(srclist);
4454
4455 if (ln == NULL) {
4456 addReply(c,shared.nullbulk);
4457 } else {
4458 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4459 robj *ele = listNodeValue(ln);
4460 list *dstlist;
4461
4462 if (dobj && dobj->type != REDIS_LIST) {
4463 addReply(c,shared.wrongtypeerr);
4464 return;
4465 }
4466
4467 /* Add the element to the target list (unless it's directly
4468 * passed to some BLPOP-ing client */
4469 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4470 if (dobj == NULL) {
4471 /* Create the list if the key does not exist */
4472 dobj = createListObject();
4473 dictAdd(c->db->dict,c->argv[2],dobj);
4474 incrRefCount(c->argv[2]);
4475 }
4476 dstlist = dobj->ptr;
4477 listAddNodeHead(dstlist,ele);
4478 incrRefCount(ele);
4479 }
4480
4481 /* Send the element to the client as reply as well */
4482 addReplyBulkLen(c,ele);
4483 addReply(c,ele);
4484 addReply(c,shared.crlf);
4485
4486 /* Finally remove the element from the source list */
4487 listDelNode(srclist,ln);
4488 server.dirty++;
4489 }
4490 }
4491 }
4492 }
4493
4494
4495 /* ==================================== Sets ================================ */
4496
4497 static void saddCommand(redisClient *c) {
4498 robj *set;
4499
4500 set = lookupKeyWrite(c->db,c->argv[1]);
4501 if (set == NULL) {
4502 set = createSetObject();
4503 dictAdd(c->db->dict,c->argv[1],set);
4504 incrRefCount(c->argv[1]);
4505 } else {
4506 if (set->type != REDIS_SET) {
4507 addReply(c,shared.wrongtypeerr);
4508 return;
4509 }
4510 }
4511 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4512 incrRefCount(c->argv[2]);
4513 server.dirty++;
4514 addReply(c,shared.cone);
4515 } else {
4516 addReply(c,shared.czero);
4517 }
4518 }
4519
4520 static void sremCommand(redisClient *c) {
4521 robj *set;
4522
4523 set = lookupKeyWrite(c->db,c->argv[1]);
4524 if (set == NULL) {
4525 addReply(c,shared.czero);
4526 } else {
4527 if (set->type != REDIS_SET) {
4528 addReply(c,shared.wrongtypeerr);
4529 return;
4530 }
4531 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4532 server.dirty++;
4533 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4534 addReply(c,shared.cone);
4535 } else {
4536 addReply(c,shared.czero);
4537 }
4538 }
4539 }
4540
4541 static void smoveCommand(redisClient *c) {
4542 robj *srcset, *dstset;
4543
4544 srcset = lookupKeyWrite(c->db,c->argv[1]);
4545 dstset = lookupKeyWrite(c->db,c->argv[2]);
4546
4547 /* If the source key does not exist return 0, if it's of the wrong type
4548 * raise an error */
4549 if (srcset == NULL || srcset->type != REDIS_SET) {
4550 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4551 return;
4552 }
4553 /* Error if the destination key is not a set as well */
4554 if (dstset && dstset->type != REDIS_SET) {
4555 addReply(c,shared.wrongtypeerr);
4556 return;
4557 }
4558 /* Remove the element from the source set */
4559 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4560 /* Key not found in the src set! return zero */
4561 addReply(c,shared.czero);
4562 return;
4563 }
4564 server.dirty++;
4565 /* Add the element to the destination set */
4566 if (!dstset) {
4567 dstset = createSetObject();
4568 dictAdd(c->db->dict,c->argv[2],dstset);
4569 incrRefCount(c->argv[2]);
4570 }
4571 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4572 incrRefCount(c->argv[3]);
4573 addReply(c,shared.cone);
4574 }
4575
4576 static void sismemberCommand(redisClient *c) {
4577 robj *set;
4578
4579 set = lookupKeyRead(c->db,c->argv[1]);
4580 if (set == NULL) {
4581 addReply(c,shared.czero);
4582 } else {
4583 if (set->type != REDIS_SET) {
4584 addReply(c,shared.wrongtypeerr);
4585 return;
4586 }
4587 if (dictFind(set->ptr,c->argv[2]))
4588 addReply(c,shared.cone);
4589 else
4590 addReply(c,shared.czero);
4591 }
4592 }
4593
4594 static void scardCommand(redisClient *c) {
4595 robj *o;
4596 dict *s;
4597
4598 o = lookupKeyRead(c->db,c->argv[1]);
4599 if (o == NULL) {
4600 addReply(c,shared.czero);
4601 return;
4602 } else {
4603 if (o->type != REDIS_SET) {
4604 addReply(c,shared.wrongtypeerr);
4605 } else {
4606 s = o->ptr;
4607 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4608 dictSize(s)));
4609 }
4610 }
4611 }
4612
4613 static void spopCommand(redisClient *c) {
4614 robj *set;
4615 dictEntry *de;
4616
4617 set = lookupKeyWrite(c->db,c->argv[1]);
4618 if (set == NULL) {
4619 addReply(c,shared.nullbulk);
4620 } else {
4621 if (set->type != REDIS_SET) {
4622 addReply(c,shared.wrongtypeerr);
4623 return;
4624 }
4625 de = dictGetRandomKey(set->ptr);
4626 if (de == NULL) {
4627 addReply(c,shared.nullbulk);
4628 } else {
4629 robj *ele = dictGetEntryKey(de);
4630
4631 addReplyBulkLen(c,ele);
4632 addReply(c,ele);
4633 addReply(c,shared.crlf);
4634 dictDelete(set->ptr,ele);
4635 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4636 server.dirty++;
4637 }
4638 }
4639 }
4640
4641 static void srandmemberCommand(redisClient *c) {
4642 robj *set;
4643 dictEntry *de;
4644
4645 set = lookupKeyRead(c->db,c->argv[1]);
4646 if (set == NULL) {
4647 addReply(c,shared.nullbulk);
4648 } else {
4649 if (set->type != REDIS_SET) {
4650 addReply(c,shared.wrongtypeerr);
4651 return;
4652 }
4653 de = dictGetRandomKey(set->ptr);
4654 if (de == NULL) {
4655 addReply(c,shared.nullbulk);
4656 } else {
4657 robj *ele = dictGetEntryKey(de);
4658
4659 addReplyBulkLen(c,ele);
4660 addReply(c,ele);
4661 addReply(c,shared.crlf);
4662 }
4663 }
4664 }
4665
4666 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4667 dict **d1 = (void*) s1, **d2 = (void*) s2;
4668
4669 return dictSize(*d1)-dictSize(*d2);
4670 }
4671
4672 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4673 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4674 dictIterator *di;
4675 dictEntry *de;
4676 robj *lenobj = NULL, *dstset = NULL;
4677 unsigned long j, cardinality = 0;
4678
4679 for (j = 0; j < setsnum; j++) {
4680 robj *setobj;
4681
4682 setobj = dstkey ?
4683 lookupKeyWrite(c->db,setskeys[j]) :
4684 lookupKeyRead(c->db,setskeys[j]);
4685 if (!setobj) {
4686 zfree(dv);
4687 if (dstkey) {
4688 if (deleteKey(c->db,dstkey))
4689 server.dirty++;
4690 addReply(c,shared.czero);
4691 } else {
4692 addReply(c,shared.nullmultibulk);
4693 }
4694 return;
4695 }
4696 if (setobj->type != REDIS_SET) {
4697 zfree(dv);
4698 addReply(c,shared.wrongtypeerr);
4699 return;
4700 }
4701 dv[j] = setobj->ptr;
4702 }
4703 /* Sort sets from the smallest to largest, this will improve our
4704 * algorithm's performace */
4705 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4706
4707 /* The first thing we should output is the total number of elements...
4708 * since this is a multi-bulk write, but at this stage we don't know
4709 * the intersection set size, so we use a trick, append an empty object
4710 * to the output list and save the pointer to later modify it with the
4711 * right length */
4712 if (!dstkey) {
4713 lenobj = createObject(REDIS_STRING,NULL);
4714 addReply(c,lenobj);
4715 decrRefCount(lenobj);
4716 } else {
4717 /* If we have a target key where to store the resulting set
4718 * create this key with an empty set inside */
4719 dstset = createSetObject();
4720 }
4721
4722 /* Iterate all the elements of the first (smallest) set, and test
4723 * the element against all the other sets, if at least one set does
4724 * not include the element it is discarded */
4725 di = dictGetIterator(dv[0]);
4726
4727 while((de = dictNext(di)) != NULL) {
4728 robj *ele;
4729
4730 for (j = 1; j < setsnum; j++)
4731 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4732 if (j != setsnum)
4733 continue; /* at least one set does not contain the member */
4734 ele = dictGetEntryKey(de);
4735 if (!dstkey) {
4736 addReplyBulkLen(c,ele);
4737 addReply(c,ele);
4738 addReply(c,shared.crlf);
4739 cardinality++;
4740 } else {
4741 dictAdd(dstset->ptr,ele,NULL);
4742 incrRefCount(ele);
4743 }
4744 }
4745 dictReleaseIterator(di);
4746
4747 if (dstkey) {
4748 /* Store the resulting set into the target */
4749 deleteKey(c->db,dstkey);
4750 dictAdd(c->db->dict,dstkey,dstset);
4751 incrRefCount(dstkey);
4752 }
4753
4754 if (!dstkey) {
4755 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4756 } else {
4757 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4758 dictSize((dict*)dstset->ptr)));
4759 server.dirty++;
4760 }
4761 zfree(dv);
4762 }
4763
4764 static void sinterCommand(redisClient *c) {
4765 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4766 }
4767
4768 static void sinterstoreCommand(redisClient *c) {
4769 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4770 }
4771
/* Operation selectors accepted by sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0    /* add the members of every set to the result */
#define REDIS_OP_DIFF 1     /* start from the first set, remove the others */
4774
4775 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4776 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4777 dictIterator *di;
4778 dictEntry *de;
4779 robj *dstset = NULL;
4780 int j, cardinality = 0;
4781
4782 for (j = 0; j < setsnum; j++) {
4783 robj *setobj;
4784
4785 setobj = dstkey ?
4786 lookupKeyWrite(c->db,setskeys[j]) :
4787 lookupKeyRead(c->db,setskeys[j]);
4788 if (!setobj) {
4789 dv[j] = NULL;
4790 continue;
4791 }
4792 if (setobj->type != REDIS_SET) {
4793 zfree(dv);
4794 addReply(c,shared.wrongtypeerr);
4795 return;
4796 }
4797 dv[j] = setobj->ptr;
4798 }
4799
4800 /* We need a temp set object to store our union. If the dstkey
4801 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4802 * this set object will be the resulting object to set into the target key*/
4803 dstset = createSetObject();
4804
4805 /* Iterate all the elements of all the sets, add every element a single
4806 * time to the result set */
4807 for (j = 0; j < setsnum; j++) {
4808 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4809 if (!dv[j]) continue; /* non existing keys are like empty sets */
4810
4811 di = dictGetIterator(dv[j]);
4812
4813 while((de = dictNext(di)) != NULL) {
4814 robj *ele;
4815
4816 /* dictAdd will not add the same element multiple times */
4817 ele = dictGetEntryKey(de);
4818 if (op == REDIS_OP_UNION || j == 0) {
4819 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4820 incrRefCount(ele);
4821 cardinality++;
4822 }
4823 } else if (op == REDIS_OP_DIFF) {
4824 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4825 cardinality--;
4826 }
4827 }
4828 }
4829 dictReleaseIterator(di);
4830
4831 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4832 }
4833
4834 /* Output the content of the resulting set, if not in STORE mode */
4835 if (!dstkey) {
4836 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4837 di = dictGetIterator(dstset->ptr);
4838 while((de = dictNext(di)) != NULL) {
4839 robj *ele;
4840
4841 ele = dictGetEntryKey(de);
4842 addReplyBulkLen(c,ele);
4843 addReply(c,ele);
4844 addReply(c,shared.crlf);
4845 }
4846 dictReleaseIterator(di);
4847 } else {
4848 /* If we have a target key where to store the resulting set
4849 * create this key with the result set inside */
4850 deleteKey(c->db,dstkey);
4851 dictAdd(c->db->dict,dstkey,dstset);
4852 incrRefCount(dstkey);
4853 }
4854
4855 /* Cleanup */
4856 if (!dstkey) {
4857 decrRefCount(dstset);
4858 } else {
4859 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4860 dictSize((dict*)dstset->ptr)));
4861 server.dirty++;
4862 }
4863 zfree(dv);
4864 }
4865
4866 static void sunionCommand(redisClient *c) {
4867 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4868 }
4869
4870 static void sunionstoreCommand(redisClient *c) {
4871 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4872 }
4873
4874 static void sdiffCommand(redisClient *c) {
4875 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4876 }
4877
4878 static void sdiffstoreCommand(redisClient *c) {
4879 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4880 }
4881
4882 /* ==================================== ZSets =============================== */
4883
4884 /* ZSETs are ordered sets using two data structures to hold the same elements
4885 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4886 * data structure.
4887 *
4888 * The elements are added to a hash table mapping Redis objects to scores.
4889 * At the same time the elements are added to a skip list mapping scores
4890 * to Redis objects (so objects are sorted by scores in this "view"). */
4891
4892 /* This skiplist implementation is almost a C translation of the original
4893 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4894 * Alternative to Balanced Trees", modified in three ways:
4895 * a) this implementation allows for repeated values.
4896 * b) the comparison is not just by key (our 'score') but by satellite data.
4897 * c) there is a back pointer, so it's a doubly linked list with the back
4898 * pointers being only at "level 1". This allows to traverse the list
4899 * from tail to head, useful for ZREVRANGE. */
4900
4901 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4902 zskiplistNode *zn = zmalloc(sizeof(*zn));
4903
4904 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4905 if (level > 0)
4906 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4907 zn->score = score;
4908 zn->obj = obj;
4909 return zn;
4910 }
4911
4912 static zskiplist *zslCreate(void) {
4913 int j;
4914 zskiplist *zsl;
4915
4916 zsl = zmalloc(sizeof(*zsl));
4917 zsl->level = 1;
4918 zsl->length = 0;
4919 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4920 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4921 zsl->header->forward[j] = NULL;
4922
4923 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4924 if (j < ZSKIPLIST_MAXLEVEL-1)
4925 zsl->header->span[j] = 0;
4926 }
4927 zsl->header->backward = NULL;
4928 zsl->tail = NULL;
4929 return zsl;
4930 }
4931
4932 static void zslFreeNode(zskiplistNode *node) {
4933 decrRefCount(node->obj);
4934 zfree(node->forward);
4935 zfree(node->span);
4936 zfree(node);
4937 }
4938
4939 static void zslFree(zskiplist *zsl) {
4940 zskiplistNode *node = zsl->header->forward[0], *next;
4941
4942 zfree(zsl->header->forward);
4943 zfree(zsl->header->span);
4944 zfree(zsl->header);
4945 while(node) {
4946 next = node->forward[0];
4947 zslFreeNode(node);
4948 node = next;
4949 }
4950 zfree(zsl);
4951 }
4952
4953 static int zslRandomLevel(void) {
4954 int level = 1;
4955 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4956 level += 1;
4957 return level;
4958 }
4959
4960 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4961 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4962 unsigned int rank[ZSKIPLIST_MAXLEVEL];
4963 int i, level;
4964
4965 x = zsl->header;
4966 for (i = zsl->level-1; i >= 0; i--) {
4967 /* store rank that is crossed to reach the insert position */
4968 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
4969
4970 while (x->forward[i] &&
4971 (x->forward[i]->score < score ||
4972 (x->forward[i]->score == score &&
4973 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4974 rank[i] += i > 0 ? x->span[i-1] : 1;
4975 x = x->forward[i];
4976 }
4977 update[i] = x;
4978 }
4979 /* we assume the key is not already inside, since we allow duplicated
4980 * scores, and the re-insertion of score and redis object should never
4981 * happpen since the caller of zslInsert() should test in the hash table
4982 * if the element is already inside or not. */
4983 level = zslRandomLevel();
4984 if (level > zsl->level) {
4985 for (i = zsl->level; i < level; i++) {
4986 rank[i] = 0;
4987 update[i] = zsl->header;
4988 update[i]->span[i-1] = zsl->length;
4989 }
4990 zsl->level = level;
4991 }
4992 x = zslCreateNode(level,score,obj);
4993 for (i = 0; i < level; i++) {
4994 x->forward[i] = update[i]->forward[i];
4995 update[i]->forward[i] = x;
4996
4997 /* update span covered by update[i] as x is inserted here */
4998 if (i > 0) {
4999 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5000 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5001 }
5002 }
5003
5004 /* increment span for untouched levels */
5005 for (i = level; i < zsl->level; i++) {
5006 update[i]->span[i-1]++;
5007 }
5008
5009 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5010 if (x->forward[0])
5011 x->forward[0]->backward = x;
5012 else
5013 zsl->tail = x;
5014 zsl->length++;
5015 }
5016
5017 /* Delete an element with matching score/object from the skiplist. */
5018 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5019 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5020 int i;
5021
5022 x = zsl->header;
5023 for (i = zsl->level-1; i >= 0; i--) {
5024 while (x->forward[i] &&
5025 (x->forward[i]->score < score ||
5026 (x->forward[i]->score == score &&
5027 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5028 x = x->forward[i];
5029 update[i] = x;
5030 }
5031 /* We may have multiple elements with the same score, what we need
5032 * is to find the element with both the right score and object. */
5033 x = x->forward[0];
5034 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5035 for (i = 0; i < zsl->level; i++) {
5036 if (update[i]->forward[i] == x) {
5037 if (i > 0) {
5038 update[i]->span[i-1] += x->span[i-1] - 1;
5039 }
5040 update[i]->forward[i] = x->forward[i];
5041 } else {
5042 /* invariant: i > 0, because update[0]->forward[0]
5043 * is always equal to x */
5044 update[i]->span[i-1] -= 1;
5045 }
5046 }
5047 if (x->forward[0]) {
5048 x->forward[0]->backward = x->backward;
5049 } else {
5050 zsl->tail = x->backward;
5051 }
5052 zslFreeNode(x);
5053 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5054 zsl->level--;
5055 zsl->length--;
5056 return 1;
5057 } else {
5058 return 0; /* not found */
5059 }
5060 return 0; /* not found */
5061 }
5062
5063 /* Delete all the elements with score between min and max from the skiplist.
5064 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5065 * Note that this function takes the reference to the hash table view of the
5066 * sorted set, in order to remove the elements from the hash table too. */
5067 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
5068 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5069 unsigned long removed = 0;
5070 int i;
5071
5072 x = zsl->header;
5073 for (i = zsl->level-1; i >= 0; i--) {
5074 while (x->forward[i] && x->forward[i]->score < min)
5075 x = x->forward[i];
5076 update[i] = x;
5077 }
5078 /* We may have multiple elements with the same score, what we need
5079 * is to find the element with both the right score and object. */
5080 x = x->forward[0];
5081 while (x && x->score <= max) {
5082 zskiplistNode *next;
5083
5084 for (i = 0; i < zsl->level; i++) {
5085 if (update[i]->forward[i] == x) {
5086 if (i > 0) {
5087 update[i]->span[i-1] += x->span[i-1] - 1;
5088 }
5089 update[i]->forward[i] = x->forward[i];
5090 } else {
5091 /* invariant: i > 0, because update[0]->forward[0]
5092 * is always equal to x */
5093 update[i]->span[i-1] -= 1;
5094 }
5095 }
5096 if (x->forward[0]) {
5097 x->forward[0]->backward = x->backward;
5098 } else {
5099 zsl->tail = x->backward;
5100 }
5101 next = x->forward[0];
5102 dictDelete(dict,x->obj);
5103 zslFreeNode(x);
5104 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5105 zsl->level--;
5106 zsl->length--;
5107 removed++;
5108 x = next;
5109 }
5110 return removed; /* not found */
5111 }
5112
5113 /* Find the first node having a score equal or greater than the specified one.
5114 * Returns NULL if there is no match. */
5115 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5116 zskiplistNode *x;
5117 int i;
5118
5119 x = zsl->header;
5120 for (i = zsl->level-1; i >= 0; i--) {
5121 while (x->forward[i] && x->forward[i]->score < score)
5122 x = x->forward[i];
5123 }
5124 /* We may have multiple elements with the same score, what we need
5125 * is to find the element with both the right score and object. */
5126 return x->forward[0];
5127 }
5128
5129 /* Find the rank for an element by both score and key.
5130 * Returns 0 when the element cannot be found, rank otherwise.
5131 * Note that the rank is 1-based due to the span of zsl->header to the
5132 * first element. */
5133 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5134 zskiplistNode *x;
5135 unsigned long rank = 0;
5136 int i;
5137
5138 x = zsl->header;
5139 for (i = zsl->level-1; i >= 0; i--) {
5140 while (x->forward[i] &&
5141 (x->forward[i]->score < score ||
5142 (x->forward[i]->score == score &&
5143 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5144 rank += i > 0 ? x->span[i-1] : 1;
5145 x = x->forward[i];
5146 }
5147
5148 /* x might be equal to zsl->header, so test if obj is non-NULL */
5149 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5150 return rank;
5151 }
5152 }
5153 return 0;
5154 }
5155
5156 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5157 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5158 zskiplistNode *x;
5159 unsigned long traversed = 0;
5160 int i;
5161
5162 x = zsl->header;
5163 for (i = zsl->level-1; i >= 0; i--) {
5164 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5165 traversed += i > 0 ? x->span[i-1] : 1;
5166 x = x->forward[i];
5167 }
5168
5169 if (traversed == rank) {
5170 return x;
5171 }
5172 }
5173 return NULL;
5174 }
5175
5176 /* The actual Z-commands implementations */
5177
5178 /* This generic command implements both ZADD and ZINCRBY.
5179 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5180 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5181 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5182 robj *zsetobj;
5183 zset *zs;
5184 double *score;
5185
5186 zsetobj = lookupKeyWrite(c->db,key);
5187 if (zsetobj == NULL) {
5188 zsetobj = createZsetObject();
5189 dictAdd(c->db->dict,key,zsetobj);
5190 incrRefCount(key);
5191 } else {
5192 if (zsetobj->type != REDIS_ZSET) {
5193 addReply(c,shared.wrongtypeerr);
5194 return;
5195 }
5196 }
5197 zs = zsetobj->ptr;
5198
5199 /* Ok now since we implement both ZADD and ZINCRBY here the code
5200 * needs to handle the two different conditions. It's all about setting
5201 * '*score', that is, the new score to set, to the right value. */
5202 score = zmalloc(sizeof(double));
5203 if (doincrement) {
5204 dictEntry *de;
5205
5206 /* Read the old score. If the element was not present starts from 0 */
5207 de = dictFind(zs->dict,ele);
5208 if (de) {
5209 double *oldscore = dictGetEntryVal(de);
5210 *score = *oldscore + scoreval;
5211 } else {
5212 *score = scoreval;
5213 }
5214 } else {
5215 *score = scoreval;
5216 }
5217
5218 /* What follows is a simple remove and re-insert operation that is common
5219 * to both ZADD and ZINCRBY... */
5220 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5221 /* case 1: New element */
5222 incrRefCount(ele); /* added to hash */
5223 zslInsert(zs->zsl,*score,ele);
5224 incrRefCount(ele); /* added to skiplist */
5225 server.dirty++;
5226 if (doincrement)
5227 addReplyDouble(c,*score);
5228 else
5229 addReply(c,shared.cone);
5230 } else {
5231 dictEntry *de;
5232 double *oldscore;
5233
5234 /* case 2: Score update operation */
5235 de = dictFind(zs->dict,ele);
5236 redisAssert(de != NULL);
5237 oldscore = dictGetEntryVal(de);
5238 if (*score != *oldscore) {
5239 int deleted;
5240
5241 /* Remove and insert the element in the skip list with new score */
5242 deleted = zslDelete(zs->zsl,*oldscore,ele);
5243 redisAssert(deleted != 0);
5244 zslInsert(zs->zsl,*score,ele);
5245 incrRefCount(ele);
5246 /* Update the score in the hash table */
5247 dictReplace(zs->dict,ele,score);
5248 server.dirty++;
5249 } else {
5250 zfree(score);
5251 }
5252 if (doincrement)
5253 addReplyDouble(c,*score);
5254 else
5255 addReply(c,shared.czero);
5256 }
5257 }
5258
5259 static void zaddCommand(redisClient *c) {
5260 double scoreval;
5261
5262 scoreval = strtod(c->argv[2]->ptr,NULL);
5263 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5264 }
5265
5266 static void zincrbyCommand(redisClient *c) {
5267 double scoreval;
5268
5269 scoreval = strtod(c->argv[2]->ptr,NULL);
5270 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5271 }
5272
5273 static void zremCommand(redisClient *c) {
5274 robj *zsetobj;
5275 zset *zs;
5276
5277 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5278 if (zsetobj == NULL) {
5279 addReply(c,shared.czero);
5280 } else {
5281 dictEntry *de;
5282 double *oldscore;
5283 int deleted;
5284
5285 if (zsetobj->type != REDIS_ZSET) {
5286 addReply(c,shared.wrongtypeerr);
5287 return;
5288 }
5289 zs = zsetobj->ptr;
5290 de = dictFind(zs->dict,c->argv[2]);
5291 if (de == NULL) {
5292 addReply(c,shared.czero);
5293 return;
5294 }
5295 /* Delete from the skiplist */
5296 oldscore = dictGetEntryVal(de);
5297 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5298 redisAssert(deleted != 0);
5299
5300 /* Delete from the hash table */
5301 dictDelete(zs->dict,c->argv[2]);
5302 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5303 server.dirty++;
5304 addReply(c,shared.cone);
5305 }
5306 }
5307
5308 static void zremrangebyscoreCommand(redisClient *c) {
5309 double min = strtod(c->argv[2]->ptr,NULL);
5310 double max = strtod(c->argv[3]->ptr,NULL);
5311 robj *zsetobj;
5312 zset *zs;
5313
5314 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5315 if (zsetobj == NULL) {
5316 addReply(c,shared.czero);
5317 } else {
5318 long deleted;
5319
5320 if (zsetobj->type != REDIS_ZSET) {
5321 addReply(c,shared.wrongtypeerr);
5322 return;
5323 }
5324 zs = zsetobj->ptr;
5325 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5326 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5327 server.dirty += deleted;
5328 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5329 }
5330 }
5331
5332 /* This command merges 2 or more zsets to a destination. When an element
5333 * does not exist in a certain set, score 0 is assumed. The score for an
5334 * element across sets is summed. */
5335 static void zmergeGenericCommand(redisClient *c, int readweights) {
5336 int i, j, zsetnum;
5337 dict **srcdict;
5338 double *weights;
5339 robj *dstkey = c->argv[1], *dstobj;
5340 zset *dst;
5341 dictIterator *di;
5342 dictEntry *de;
5343
5344 zsetnum = c->argc-2;
5345 if (readweights) {
5346 /* force number of arguments to be even */
5347 if (zsetnum % 2 > 0) {
5348 addReplySds(c,sdsnew("-ERR wrong number of arguments for ZMERGEWEIGHED\r\n"));
5349 return;
5350 }
5351 zsetnum /= 2;
5352 }
5353 if (!zsetnum) {
5354 addReply(c,shared.syntaxerr);
5355 return;
5356 }
5357
5358 srcdict = zmalloc(sizeof(dict*) * zsetnum);
5359 weights = zmalloc(sizeof(double) * zsetnum);
5360 for (i = 0; i < zsetnum; i++) {
5361 if (readweights) {
5362 j = 2 + 2*i;
5363 weights[i] = strtod(c->argv[j+1]->ptr, NULL);
5364 } else {
5365 j = 2 + i;
5366 weights[i] = 1.0;
5367 }
5368
5369 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5370 if (!zsetobj) {
5371 srcdict[i] = NULL;
5372 } else {
5373 if (zsetobj->type != REDIS_ZSET) {
5374 zfree(srcdict);
5375 zfree(weights);
5376 addReply(c,shared.wrongtypeerr);
5377 return;
5378 }
5379 srcdict[i] = ((zset*)zsetobj->ptr)->dict;
5380 }
5381 }
5382
5383 dstobj = createZsetObject();
5384 dst = dstobj->ptr;
5385 for (i = 0; i < zsetnum; i++) {
5386 if (!srcdict[i]) continue;
5387
5388 di = dictGetIterator(srcdict[i]);
5389 while((de = dictNext(di)) != NULL) {
5390 /* skip key when already processed */
5391 if (dictFind(dst->dict,dictGetEntryKey(de)) != NULL) continue;
5392
5393 double *score = zmalloc(sizeof(double));
5394 *score = 0.0;
5395 for (j = 0; j < zsetnum; j++) {
5396 if (!srcdict[j]) continue;
5397
5398 dictEntry *other = dictFind(srcdict[j],dictGetEntryKey(de));
5399 if (other) {
5400 *score = *score + weights[j] * (*(double*)dictGetEntryVal(other));
5401 }
5402 }
5403
5404 robj *o = dictGetEntryKey(de);
5405 dictAdd(dst->dict,o,score);
5406 incrRefCount(o); /* added to dictionary */
5407 zslInsert(dst->zsl,*score,o);
5408 incrRefCount(o); /* added to skiplist */
5409 }
5410 dictReleaseIterator(di);
5411 }
5412
5413 deleteKey(c->db,dstkey);
5414 dictAdd(c->db->dict,dstkey,dstobj);
5415 incrRefCount(dstkey);
5416
5417 addReplyLong(c, dst->zsl->length);
5418 server.dirty++;
5419 zfree(srcdict);
5420 zfree(weights);
5421 }
5422
5423 static void zmergeCommand(redisClient *c) {
5424 zmergeGenericCommand(c,0);
5425 }
5426
5427 static void zmergeweighedCommand(redisClient *c) {
5428 zmergeGenericCommand(c,1);
5429 }
5430
5431 static void zrangeGenericCommand(redisClient *c, int reverse) {
5432 robj *o;
5433 int start = atoi(c->argv[2]->ptr);
5434 int end = atoi(c->argv[3]->ptr);
5435 int withscores = 0;
5436
5437 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5438 withscores = 1;
5439 } else if (c->argc >= 5) {
5440 addReply(c,shared.syntaxerr);
5441 return;
5442 }
5443
5444 o = lookupKeyRead(c->db,c->argv[1]);
5445 if (o == NULL) {
5446 addReply(c,shared.nullmultibulk);
5447 } else {
5448 if (o->type != REDIS_ZSET) {
5449 addReply(c,shared.wrongtypeerr);
5450 } else {
5451 zset *zsetobj = o->ptr;
5452 zskiplist *zsl = zsetobj->zsl;
5453 zskiplistNode *ln;
5454
5455 int llen = zsl->length;
5456 int rangelen, j;
5457 robj *ele;
5458
5459 /* convert negative indexes */
5460 if (start < 0) start = llen+start;
5461 if (end < 0) end = llen+end;
5462 if (start < 0) start = 0;
5463 if (end < 0) end = 0;
5464
5465 /* indexes sanity checks */
5466 if (start > end || start >= llen) {
5467 /* Out of range start or start > end result in empty list */
5468 addReply(c,shared.emptymultibulk);
5469 return;
5470 }
5471 if (end >= llen) end = llen-1;
5472 rangelen = (end-start)+1;
5473
5474 /* check if starting point is trivial, before searching
5475 * the element in log(N) time */
5476 if (reverse) {
5477 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen - start);
5478 } else {
5479 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start + 1);
5480 }
5481
5482 /* Return the result in form of a multi-bulk reply */
5483 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5484 withscores ? (rangelen*2) : rangelen));
5485 for (j = 0; j < rangelen; j++) {
5486 ele = ln->obj;
5487 addReplyBulkLen(c,ele);
5488 addReply(c,ele);
5489 addReply(c,shared.crlf);
5490 if (withscores)
5491 addReplyDouble(c,ln->score);
5492 ln = reverse ? ln->backward : ln->forward[0];
5493 }
5494 }
5495 }
5496 }
5497
5498 static void zrangeCommand(redisClient *c) {
5499 zrangeGenericCommand(c,0);
5500 }
5501
5502 static void zrevrangeCommand(redisClient *c) {
5503 zrangeGenericCommand(c,1);
5504 }
5505
5506 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5507 * If justcount is non-zero, just the count is returned. */
5508 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5509 robj *o;
5510 double min, max;
5511 int minex = 0, maxex = 0; /* are min or max exclusive? */
5512 int offset = 0, limit = -1;
5513 int withscores = 0;
5514 int badsyntax = 0;
5515
5516 /* Parse the min-max interval. If one of the values is prefixed
5517 * by the "(" character, it's considered "open". For instance
5518 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5519 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5520 if (((char*)c->argv[2]->ptr)[0] == '(') {
5521 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5522 minex = 1;
5523 } else {
5524 min = strtod(c->argv[2]->ptr,NULL);
5525 }
5526 if (((char*)c->argv[3]->ptr)[0] == '(') {
5527 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5528 maxex = 1;
5529 } else {
5530 max = strtod(c->argv[3]->ptr,NULL);
5531 }
5532
5533 /* Parse "WITHSCORES": note that if the command was called with
5534 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5535 * enter the following paths to parse WITHSCORES and LIMIT. */
5536 if (c->argc == 5 || c->argc == 8) {
5537 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5538 withscores = 1;
5539 else
5540 badsyntax = 1;
5541 }
5542 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5543 badsyntax = 1;
5544 if (badsyntax) {
5545 addReplySds(c,
5546 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5547 return;
5548 }
5549
5550 /* Parse "LIMIT" */
5551 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5552 addReply(c,shared.syntaxerr);
5553 return;
5554 } else if (c->argc == (7 + withscores)) {
5555 offset = atoi(c->argv[5]->ptr);
5556 limit = atoi(c->argv[6]->ptr);
5557 if (offset < 0) offset = 0;
5558 }
5559
5560 /* Ok, lookup the key and get the range */
5561 o = lookupKeyRead(c->db,c->argv[1]);
5562 if (o == NULL) {
5563 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5564 } else {
5565 if (o->type != REDIS_ZSET) {
5566 addReply(c,shared.wrongtypeerr);
5567 } else {
5568 zset *zsetobj = o->ptr;
5569 zskiplist *zsl = zsetobj->zsl;
5570 zskiplistNode *ln;
5571 robj *ele, *lenobj = NULL;
5572 unsigned long rangelen = 0;
5573
5574 /* Get the first node with the score >= min, or with
5575 * score > min if 'minex' is true. */
5576 ln = zslFirstWithScore(zsl,min);
5577 while (minex && ln && ln->score == min) ln = ln->forward[0];
5578
5579 if (ln == NULL) {
5580 /* No element matching the speciifed interval */
5581 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5582 return;
5583 }
5584
5585 /* We don't know in advance how many matching elements there
5586 * are in the list, so we push this object that will represent
5587 * the multi-bulk length in the output buffer, and will "fix"
5588 * it later */
5589 if (!justcount) {
5590 lenobj = createObject(REDIS_STRING,NULL);
5591 addReply(c,lenobj);
5592 decrRefCount(lenobj);
5593 }
5594
5595 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5596 if (offset) {
5597 offset--;
5598 ln = ln->forward[0];
5599 continue;
5600 }
5601 if (limit == 0) break;
5602 if (!justcount) {
5603 ele = ln->obj;
5604 addReplyBulkLen(c,ele);
5605 addReply(c,ele);
5606 addReply(c,shared.crlf);
5607 if (withscores)
5608 addReplyDouble(c,ln->score);
5609 }
5610 ln = ln->forward[0];
5611 rangelen++;
5612 if (limit > 0) limit--;
5613 }
5614 if (justcount) {
5615 addReplyLong(c,(long)rangelen);
5616 } else {
5617 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5618 withscores ? (rangelen*2) : rangelen);
5619 }
5620 }
5621 }
5622 }
5623
5624 static void zrangebyscoreCommand(redisClient *c) {
5625 genericZrangebyscoreCommand(c,0);
5626 }
5627
5628 static void zcountCommand(redisClient *c) {
5629 genericZrangebyscoreCommand(c,1);
5630 }
5631
5632 static void zcardCommand(redisClient *c) {
5633 robj *o;
5634 zset *zs;
5635
5636 o = lookupKeyRead(c->db,c->argv[1]);
5637 if (o == NULL) {
5638 addReply(c,shared.czero);
5639 return;
5640 } else {
5641 if (o->type != REDIS_ZSET) {
5642 addReply(c,shared.wrongtypeerr);
5643 } else {
5644 zs = o->ptr;
5645 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5646 }
5647 }
5648 }
5649
5650 static void zscoreCommand(redisClient *c) {
5651 robj *o;
5652 zset *zs;
5653
5654 o = lookupKeyRead(c->db,c->argv[1]);
5655 if (o == NULL) {
5656 addReply(c,shared.nullbulk);
5657 return;
5658 } else {
5659 if (o->type != REDIS_ZSET) {
5660 addReply(c,shared.wrongtypeerr);
5661 } else {
5662 dictEntry *de;
5663
5664 zs = o->ptr;
5665 de = dictFind(zs->dict,c->argv[2]);
5666 if (!de) {
5667 addReply(c,shared.nullbulk);
5668 } else {
5669 double *score = dictGetEntryVal(de);
5670
5671 addReplyDouble(c,*score);
5672 }
5673 }
5674 }
5675 }
5676
5677 static void zrankCommand(redisClient *c) {
5678 robj *o;
5679 o = lookupKeyRead(c->db,c->argv[1]);
5680 if (o == NULL) {
5681 addReply(c,shared.nullbulk);
5682 return;
5683 }
5684 if (o->type != REDIS_ZSET) {
5685 addReply(c,shared.wrongtypeerr);
5686 } else {
5687 zset *zs = o->ptr;
5688 zskiplist *zsl = zs->zsl;
5689 dictEntry *de;
5690 unsigned long rank;
5691
5692 de = dictFind(zs->dict,c->argv[2]);
5693 if (!de) {
5694 addReply(c,shared.nullbulk);
5695 return;
5696 }
5697
5698 double *score = dictGetEntryVal(de);
5699 rank = zslGetRank(zsl, *score, c->argv[2]);
5700 if (rank) {
5701 addReplyLong(c, rank-1);
5702 } else {
5703 addReply(c,shared.nullbulk);
5704 }
5705 }
5706 }
5707
5708 /* =================================== Hashes =============================== */
5709 static void hsetCommand(redisClient *c) {
5710 int update = 0;
5711 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5712
5713 if (o == NULL) {
5714 o = createHashObject();
5715 dictAdd(c->db->dict,c->argv[1],o);
5716 incrRefCount(c->argv[1]);
5717 } else {
5718 if (o->type != REDIS_HASH) {
5719 addReply(c,shared.wrongtypeerr);
5720 return;
5721 }
5722 }
5723 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5724 unsigned char *zm = o->ptr;
5725
5726 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5727 c->argv[3]->ptr,sdslen(c->argv[3]->ptr),&update);
5728 o->ptr = zm;
5729 } else {
5730 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5731 incrRefCount(c->argv[2]);
5732 } else {
5733 update = 1;
5734 }
5735 incrRefCount(c->argv[3]);
5736 }
5737 server.dirty++;
5738 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5739 }
5740
5741 static void hgetCommand(redisClient *c) {
5742 robj *o = lookupKeyRead(c->db,c->argv[1]);
5743
5744 if (o == NULL) {
5745 addReply(c,shared.nullbulk);
5746 return;
5747 } else {
5748 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5749 unsigned char *zm = o->ptr;
5750 unsigned char *val;
5751 unsigned int vlen;
5752
5753 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
5754 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5755 addReplySds(c,sdsnewlen(val,vlen));
5756 addReply(c,shared.crlf);
5757 return;
5758 } else {
5759 addReply(c,shared.nullbulk);
5760 return;
5761 }
5762 } else {
5763 struct dictEntry *de;
5764
5765 de = dictFind(o->ptr,c->argv[2]);
5766 if (de == NULL) {
5767 addReply(c,shared.nullbulk);
5768 } else {
5769 robj *e = dictGetEntryVal(de);
5770
5771 addReplyBulkLen(c,e);
5772 addReply(c,e);
5773 addReply(c,shared.crlf);
5774 }
5775 }
5776 }
5777 }
5778
5779 /* ========================= Non type-specific commands ==================== */
5780
5781 static void flushdbCommand(redisClient *c) {
5782 server.dirty += dictSize(c->db->dict);
5783 dictEmpty(c->db->dict);
5784 dictEmpty(c->db->expires);
5785 addReply(c,shared.ok);
5786 }
5787
5788 static void flushallCommand(redisClient *c) {
5789 server.dirty += emptyDb();
5790 addReply(c,shared.ok);
5791 rdbSave(server.dbfilename);
5792 server.dirty++;
5793 }
5794
5795 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5796 redisSortOperation *so = zmalloc(sizeof(*so));
5797 so->type = type;
5798 so->pattern = pattern;
5799 return so;
5800 }
5801
5802 /* Return the value associated to the key with a name obtained
5803 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5804 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5805 char *p;
5806 sds spat, ssub;
5807 robj keyobj;
5808 int prefixlen, sublen, postfixlen;
5809 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5810 struct {
5811 long len;
5812 long free;
5813 char buf[REDIS_SORTKEY_MAX+1];
5814 } keyname;
5815
5816 /* If the pattern is "#" return the substitution object itself in order
5817 * to implement the "SORT ... GET #" feature. */
5818 spat = pattern->ptr;
5819 if (spat[0] == '#' && spat[1] == '\0') {
5820 return subst;
5821 }
5822
5823 /* The substitution object may be specially encoded. If so we create
5824 * a decoded object on the fly. Otherwise getDecodedObject will just
5825 * increment the ref count, that we'll decrement later. */
5826 subst = getDecodedObject(subst);
5827
5828 ssub = subst->ptr;
5829 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5830 p = strchr(spat,'*');
5831 if (!p) {
5832 decrRefCount(subst);
5833 return NULL;
5834 }
5835
5836 prefixlen = p-spat;
5837 sublen = sdslen(ssub);
5838 postfixlen = sdslen(spat)-(prefixlen+1);
5839 memcpy(keyname.buf,spat,prefixlen);
5840 memcpy(keyname.buf+prefixlen,ssub,sublen);
5841 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5842 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5843 keyname.len = prefixlen+sublen+postfixlen;
5844
5845 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5846 decrRefCount(subst);
5847
5848 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5849 return lookupKeyRead(db,&keyobj);
5850 }
5851
5852 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5853 * the additional parameter is not standard but a BSD-specific we have to
5854 * pass sorting parameters via the global 'server' structure */
5855 static int sortCompare(const void *s1, const void *s2) {
5856 const redisSortObject *so1 = s1, *so2 = s2;
5857 int cmp;
5858
5859 if (!server.sort_alpha) {
5860 /* Numeric sorting. Here it's trivial as we precomputed scores */
5861 if (so1->u.score > so2->u.score) {
5862 cmp = 1;
5863 } else if (so1->u.score < so2->u.score) {
5864 cmp = -1;
5865 } else {
5866 cmp = 0;
5867 }
5868 } else {
5869 /* Alphanumeric sorting */
5870 if (server.sort_bypattern) {
5871 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5872 /* At least one compare object is NULL */
5873 if (so1->u.cmpobj == so2->u.cmpobj)
5874 cmp = 0;
5875 else if (so1->u.cmpobj == NULL)
5876 cmp = -1;
5877 else
5878 cmp = 1;
5879 } else {
5880 /* We have both the objects, use strcoll */
5881 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5882 }
5883 } else {
5884 /* Compare elements directly */
5885 robj *dec1, *dec2;
5886
5887 dec1 = getDecodedObject(so1->obj);
5888 dec2 = getDecodedObject(so2->obj);
5889 cmp = strcoll(dec1->ptr,dec2->ptr);
5890 decrRefCount(dec1);
5891 decrRefCount(dec2);
5892 }
5893 }
5894 return server.sort_desc ? -cmp : cmp;
5895 }
5896
5897 /* The SORT command is the most complex command in Redis. Warning: this code
5898 * is optimized for speed and a bit less for readability */
5899 static void sortCommand(redisClient *c) {
5900 list *operations;
5901 int outputlen = 0;
5902 int desc = 0, alpha = 0;
5903 int limit_start = 0, limit_count = -1, start, end;
5904 int j, dontsort = 0, vectorlen;
5905 int getop = 0; /* GET operation counter */
5906 robj *sortval, *sortby = NULL, *storekey = NULL;
5907 redisSortObject *vector; /* Resulting vector to sort */
5908
5909 /* Lookup the key to sort. It must be of the right types */
5910 sortval = lookupKeyRead(c->db,c->argv[1]);
5911 if (sortval == NULL) {
5912 addReply(c,shared.nullmultibulk);
5913 return;
5914 }
5915 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5916 sortval->type != REDIS_ZSET)
5917 {
5918 addReply(c,shared.wrongtypeerr);
5919 return;
5920 }
5921
5922 /* Create a list of operations to perform for every sorted element.
5923 * Operations can be GET/DEL/INCR/DECR */
5924 operations = listCreate();
5925 listSetFreeMethod(operations,zfree);
5926 j = 2;
5927
5928 /* Now we need to protect sortval incrementing its count, in the future
5929 * SORT may have options able to overwrite/delete keys during the sorting
5930 * and the sorted key itself may get destroied */
5931 incrRefCount(sortval);
5932
5933 /* The SORT command has an SQL-alike syntax, parse it */
5934 while(j < c->argc) {
5935 int leftargs = c->argc-j-1;
5936 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5937 desc = 0;
5938 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5939 desc = 1;
5940 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5941 alpha = 1;
5942 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5943 limit_start = atoi(c->argv[j+1]->ptr);
5944 limit_count = atoi(c->argv[j+2]->ptr);
5945 j+=2;
5946 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5947 storekey = c->argv[j+1];
5948 j++;
5949 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5950 sortby = c->argv[j+1];
5951 /* If the BY pattern does not contain '*', i.e. it is constant,
5952 * we don't need to sort nor to lookup the weight keys. */
5953 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5954 j++;
5955 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5956 listAddNodeTail(operations,createSortOperation(
5957 REDIS_SORT_GET,c->argv[j+1]));
5958 getop++;
5959 j++;
5960 } else {
5961 decrRefCount(sortval);
5962 listRelease(operations);
5963 addReply(c,shared.syntaxerr);
5964 return;
5965 }
5966 j++;
5967 }
5968
5969 /* Load the sorting vector with all the objects to sort */
5970 switch(sortval->type) {
5971 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5972 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5973 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
5974 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
5975 }
5976 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
5977 j = 0;
5978
5979 if (sortval->type == REDIS_LIST) {
5980 list *list = sortval->ptr;
5981 listNode *ln;
5982 listIter li;
5983
5984 listRewind(list,&li);
5985 while((ln = listNext(&li))) {
5986 robj *ele = ln->value;
5987 vector[j].obj = ele;
5988 vector[j].u.score = 0;
5989 vector[j].u.cmpobj = NULL;
5990 j++;
5991 }
5992 } else {
5993 dict *set;
5994 dictIterator *di;
5995 dictEntry *setele;
5996
5997 if (sortval->type == REDIS_SET) {
5998 set = sortval->ptr;
5999 } else {
6000 zset *zs = sortval->ptr;
6001 set = zs->dict;
6002 }
6003
6004 di = dictGetIterator(set);
6005 while((setele = dictNext(di)) != NULL) {
6006 vector[j].obj = dictGetEntryKey(setele);
6007 vector[j].u.score = 0;
6008 vector[j].u.cmpobj = NULL;
6009 j++;
6010 }
6011 dictReleaseIterator(di);
6012 }
6013 redisAssert(j == vectorlen);
6014
6015 /* Now it's time to load the right scores in the sorting vector */
6016 if (dontsort == 0) {
6017 for (j = 0; j < vectorlen; j++) {
6018 if (sortby) {
6019 robj *byval;
6020
6021 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6022 if (!byval || byval->type != REDIS_STRING) continue;
6023 if (alpha) {
6024 vector[j].u.cmpobj = getDecodedObject(byval);
6025 } else {
6026 if (byval->encoding == REDIS_ENCODING_RAW) {
6027 vector[j].u.score = strtod(byval->ptr,NULL);
6028 } else {
6029 /* Don't need to decode the object if it's
6030 * integer-encoded (the only encoding supported) so
6031 * far. We can just cast it */
6032 if (byval->encoding == REDIS_ENCODING_INT) {
6033 vector[j].u.score = (long)byval->ptr;
6034 } else
6035 redisAssert(1 != 1);
6036 }
6037 }
6038 } else {
6039 if (!alpha) {
6040 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6041 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6042 else {
6043 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6044 vector[j].u.score = (long) vector[j].obj->ptr;
6045 else
6046 redisAssert(1 != 1);
6047 }
6048 }
6049 }
6050 }
6051 }
6052
6053 /* We are ready to sort the vector... perform a bit of sanity check
6054 * on the LIMIT option too. We'll use a partial version of quicksort. */
6055 start = (limit_start < 0) ? 0 : limit_start;
6056 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6057 if (start >= vectorlen) {
6058 start = vectorlen-1;
6059 end = vectorlen-2;
6060 }
6061 if (end >= vectorlen) end = vectorlen-1;
6062
6063 if (dontsort == 0) {
6064 server.sort_desc = desc;
6065 server.sort_alpha = alpha;
6066 server.sort_bypattern = sortby ? 1 : 0;
6067 if (sortby && (start != 0 || end != vectorlen-1))
6068 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6069 else
6070 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6071 }
6072
6073 /* Send command output to the output buffer, performing the specified
6074 * GET/DEL/INCR/DECR operations if any. */
6075 outputlen = getop ? getop*(end-start+1) : end-start+1;
6076 if (storekey == NULL) {
6077 /* STORE option not specified, sent the sorting result to client */
6078 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6079 for (j = start; j <= end; j++) {
6080 listNode *ln;
6081 listIter li;
6082
6083 if (!getop) {
6084 addReplyBulkLen(c,vector[j].obj);
6085 addReply(c,vector[j].obj);
6086 addReply(c,shared.crlf);
6087 }
6088 listRewind(operations,&li);
6089 while((ln = listNext(&li))) {
6090 redisSortOperation *sop = ln->value;
6091 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6092 vector[j].obj);
6093
6094 if (sop->type == REDIS_SORT_GET) {
6095 if (!val || val->type != REDIS_STRING) {
6096 addReply(c,shared.nullbulk);
6097 } else {
6098 addReplyBulkLen(c,val);
6099 addReply(c,val);
6100 addReply(c,shared.crlf);
6101 }
6102 } else {
6103 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6104 }
6105 }
6106 }
6107 } else {
6108 robj *listObject = createListObject();
6109 list *listPtr = (list*) listObject->ptr;
6110
6111 /* STORE option specified, set the sorting result as a List object */
6112 for (j = start; j <= end; j++) {
6113 listNode *ln;
6114 listIter li;
6115
6116 if (!getop) {
6117 listAddNodeTail(listPtr,vector[j].obj);
6118 incrRefCount(vector[j].obj);
6119 }
6120 listRewind(operations,&li);
6121 while((ln = listNext(&li))) {
6122 redisSortOperation *sop = ln->value;
6123 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6124 vector[j].obj);
6125
6126 if (sop->type == REDIS_SORT_GET) {
6127 if (!val || val->type != REDIS_STRING) {
6128 listAddNodeTail(listPtr,createStringObject("",0));
6129 } else {
6130 listAddNodeTail(listPtr,val);
6131 incrRefCount(val);
6132 }
6133 } else {
6134 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6135 }
6136 }
6137 }
6138 if (dictReplace(c->db->dict,storekey,listObject)) {
6139 incrRefCount(storekey);
6140 }
6141 /* Note: we add 1 because the DB is dirty anyway since even if the
6142 * SORT result is empty a new key is set and maybe the old content
6143 * replaced. */
6144 server.dirty += 1+outputlen;
6145 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6146 }
6147
6148 /* Cleanup */
6149 decrRefCount(sortval);
6150 listRelease(operations);
6151 for (j = 0; j < vectorlen; j++) {
6152 if (sortby && alpha && vector[j].u.cmpobj)
6153 decrRefCount(vector[j].u.cmpobj);
6154 }
6155 zfree(vector);
6156 }
6157
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, and so forth.
 *
 * 's' is the destination buffer and is always written and NUL terminated.
 * The longest possible output is "16777216.00T" (for n = 2^64-1), so the
 * buffer must be able to hold at least 13 bytes.
 *
 * Amounts below 1024 are printed as an exact number of bytes, everything
 * else as a value with two decimals in the largest unit (K, M, G, T) that
 * keeps it >= 1. Amounts of one terabyte or more are all expressed in T,
 * so that the buffer is never left uninitialized for big values. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6178
6179 /* Create the string returned by the INFO command. This is decoupled
6180 * by the INFO command itself as we need to report the same information
6181 * on memory corruption problems. */
6182 static sds genRedisInfoString(void) {
6183 sds info;
6184 time_t uptime = time(NULL)-server.stat_starttime;
6185 int j;
6186 char hmem[64];
6187
6188 bytesToHuman(hmem,zmalloc_used_memory());
6189 info = sdscatprintf(sdsempty(),
6190 "redis_version:%s\r\n"
6191 "arch_bits:%s\r\n"
6192 "multiplexing_api:%s\r\n"
6193 "process_id:%ld\r\n"
6194 "uptime_in_seconds:%ld\r\n"
6195 "uptime_in_days:%ld\r\n"
6196 "connected_clients:%d\r\n"
6197 "connected_slaves:%d\r\n"
6198 "blocked_clients:%d\r\n"
6199 "used_memory:%zu\r\n"
6200 "used_memory_human:%s\r\n"
6201 "changes_since_last_save:%lld\r\n"
6202 "bgsave_in_progress:%d\r\n"
6203 "last_save_time:%ld\r\n"
6204 "bgrewriteaof_in_progress:%d\r\n"
6205 "total_connections_received:%lld\r\n"
6206 "total_commands_processed:%lld\r\n"
6207 "vm_enabled:%d\r\n"
6208 "role:%s\r\n"
6209 ,REDIS_VERSION,
6210 (sizeof(long) == 8) ? "64" : "32",
6211 aeGetApiName(),
6212 (long) getpid(),
6213 uptime,
6214 uptime/(3600*24),
6215 listLength(server.clients)-listLength(server.slaves),
6216 listLength(server.slaves),
6217 server.blpop_blocked_clients,
6218 zmalloc_used_memory(),
6219 hmem,
6220 server.dirty,
6221 server.bgsavechildpid != -1,
6222 server.lastsave,
6223 server.bgrewritechildpid != -1,
6224 server.stat_numconnections,
6225 server.stat_numcommands,
6226 server.vm_enabled != 0,
6227 server.masterhost == NULL ? "master" : "slave"
6228 );
6229 if (server.masterhost) {
6230 info = sdscatprintf(info,
6231 "master_host:%s\r\n"
6232 "master_port:%d\r\n"
6233 "master_link_status:%s\r\n"
6234 "master_last_io_seconds_ago:%d\r\n"
6235 ,server.masterhost,
6236 server.masterport,
6237 (server.replstate == REDIS_REPL_CONNECTED) ?
6238 "up" : "down",
6239 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6240 );
6241 }
6242 if (server.vm_enabled) {
6243 lockThreadedIO();
6244 info = sdscatprintf(info,
6245 "vm_conf_max_memory:%llu\r\n"
6246 "vm_conf_page_size:%llu\r\n"
6247 "vm_conf_pages:%llu\r\n"
6248 "vm_stats_used_pages:%llu\r\n"
6249 "vm_stats_swapped_objects:%llu\r\n"
6250 "vm_stats_swappin_count:%llu\r\n"
6251 "vm_stats_swappout_count:%llu\r\n"
6252 "vm_stats_io_newjobs_len:%lu\r\n"
6253 "vm_stats_io_processing_len:%lu\r\n"
6254 "vm_stats_io_processed_len:%lu\r\n"
6255 "vm_stats_io_active_threads:%lu\r\n"
6256 "vm_stats_blocked_clients:%lu\r\n"
6257 ,(unsigned long long) server.vm_max_memory,
6258 (unsigned long long) server.vm_page_size,
6259 (unsigned long long) server.vm_pages,
6260 (unsigned long long) server.vm_stats_used_pages,
6261 (unsigned long long) server.vm_stats_swapped_objects,
6262 (unsigned long long) server.vm_stats_swapins,
6263 (unsigned long long) server.vm_stats_swapouts,
6264 (unsigned long) listLength(server.io_newjobs),
6265 (unsigned long) listLength(server.io_processing),
6266 (unsigned long) listLength(server.io_processed),
6267 (unsigned long) server.io_active_threads,
6268 (unsigned long) server.vm_blocked_clients
6269 );
6270 unlockThreadedIO();
6271 }
6272 for (j = 0; j < server.dbnum; j++) {
6273 long long keys, vkeys;
6274
6275 keys = dictSize(server.db[j].dict);
6276 vkeys = dictSize(server.db[j].expires);
6277 if (keys || vkeys) {
6278 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6279 j, keys, vkeys);
6280 }
6281 }
6282 return info;
6283 }
6284
6285 static void infoCommand(redisClient *c) {
6286 sds info = genRedisInfoString();
6287 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6288 (unsigned long)sdslen(info)));
6289 addReplySds(c,info);
6290 addReply(c,shared.crlf);
6291 }
6292
6293 static void monitorCommand(redisClient *c) {
6294 /* ignore MONITOR if aleady slave or in monitor mode */
6295 if (c->flags & REDIS_SLAVE) return;
6296
6297 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6298 c->slaveseldb = 0;
6299 listAddNodeTail(server.monitors,c);
6300 addReply(c,shared.ok);
6301 }
6302
6303 /* ================================= Expire ================================= */
6304 static int removeExpire(redisDb *db, robj *key) {
6305 if (dictDelete(db->expires,key) == DICT_OK) {
6306 return 1;
6307 } else {
6308 return 0;
6309 }
6310 }
6311
6312 static int setExpire(redisDb *db, robj *key, time_t when) {
6313 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6314 return 0;
6315 } else {
6316 incrRefCount(key);
6317 return 1;
6318 }
6319 }
6320
6321 /* Return the expire time of the specified key, or -1 if no expire
6322 * is associated with this key (i.e. the key is non volatile) */
6323 static time_t getExpire(redisDb *db, robj *key) {
6324 dictEntry *de;
6325
6326 /* No expire? return ASAP */
6327 if (dictSize(db->expires) == 0 ||
6328 (de = dictFind(db->expires,key)) == NULL) return -1;
6329
6330 return (time_t) dictGetEntryVal(de);
6331 }
6332
6333 static int expireIfNeeded(redisDb *db, robj *key) {
6334 time_t when;
6335 dictEntry *de;
6336
6337 /* No expire? return ASAP */
6338 if (dictSize(db->expires) == 0 ||
6339 (de = dictFind(db->expires,key)) == NULL) return 0;
6340
6341 /* Lookup the expire */
6342 when = (time_t) dictGetEntryVal(de);
6343 if (time(NULL) <= when) return 0;
6344
6345 /* Delete the key */
6346 dictDelete(db->expires,key);
6347 return dictDelete(db->dict,key) == DICT_OK;
6348 }
6349
6350 static int deleteIfVolatile(redisDb *db, robj *key) {
6351 dictEntry *de;
6352
6353 /* No expire? return ASAP */
6354 if (dictSize(db->expires) == 0 ||
6355 (de = dictFind(db->expires,key)) == NULL) return 0;
6356
6357 /* Delete the key */
6358 server.dirty++;
6359 dictDelete(db->expires,key);
6360 return dictDelete(db->dict,key) == DICT_OK;
6361 }
6362
6363 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6364 dictEntry *de;
6365
6366 de = dictFind(c->db->dict,key);
6367 if (de == NULL) {
6368 addReply(c,shared.czero);
6369 return;
6370 }
6371 if (seconds < 0) {
6372 if (deleteKey(c->db,key)) server.dirty++;
6373 addReply(c, shared.cone);
6374 return;
6375 } else {
6376 time_t when = time(NULL)+seconds;
6377 if (setExpire(c->db,key,when)) {
6378 addReply(c,shared.cone);
6379 server.dirty++;
6380 } else {
6381 addReply(c,shared.czero);
6382 }
6383 return;
6384 }
6385 }
6386
6387 static void expireCommand(redisClient *c) {
6388 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6389 }
6390
6391 static void expireatCommand(redisClient *c) {
6392 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6393 }
6394
6395 static void ttlCommand(redisClient *c) {
6396 time_t expire;
6397 int ttl = -1;
6398
6399 expire = getExpire(c->db,c->argv[1]);
6400 if (expire != -1) {
6401 ttl = (int) (expire-time(NULL));
6402 if (ttl < 0) ttl = -1;
6403 }
6404 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6405 }
6406
6407 /* ================================ MULTI/EXEC ============================== */
6408
6409 /* Client state initialization for MULTI/EXEC */
6410 static void initClientMultiState(redisClient *c) {
6411 c->mstate.commands = NULL;
6412 c->mstate.count = 0;
6413 }
6414
6415 /* Release all the resources associated with MULTI/EXEC state */
6416 static void freeClientMultiState(redisClient *c) {
6417 int j;
6418
6419 for (j = 0; j < c->mstate.count; j++) {
6420 int i;
6421 multiCmd *mc = c->mstate.commands+j;
6422
6423 for (i = 0; i < mc->argc; i++)
6424 decrRefCount(mc->argv[i]);
6425 zfree(mc->argv);
6426 }
6427 zfree(c->mstate.commands);
6428 }
6429
6430 /* Add a new command into the MULTI commands queue */
6431 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6432 multiCmd *mc;
6433 int j;
6434
6435 c->mstate.commands = zrealloc(c->mstate.commands,
6436 sizeof(multiCmd)*(c->mstate.count+1));
6437 mc = c->mstate.commands+c->mstate.count;
6438 mc->cmd = cmd;
6439 mc->argc = c->argc;
6440 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6441 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6442 for (j = 0; j < c->argc; j++)
6443 incrRefCount(mc->argv[j]);
6444 c->mstate.count++;
6445 }
6446
6447 static void multiCommand(redisClient *c) {
6448 c->flags |= REDIS_MULTI;
6449 addReply(c,shared.ok);
6450 }
6451
6452 static void discardCommand(redisClient *c) {
6453 if (!(c->flags & REDIS_MULTI)) {
6454 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6455 return;
6456 }
6457
6458 freeClientMultiState(c);
6459 initClientMultiState(c);
6460 c->flags &= (~REDIS_MULTI);
6461 addReply(c,shared.ok);
6462 }
6463
6464 static void execCommand(redisClient *c) {
6465 int j;
6466 robj **orig_argv;
6467 int orig_argc;
6468
6469 if (!(c->flags & REDIS_MULTI)) {
6470 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6471 return;
6472 }
6473
6474 orig_argv = c->argv;
6475 orig_argc = c->argc;
6476 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6477 for (j = 0; j < c->mstate.count; j++) {
6478 c->argc = c->mstate.commands[j].argc;
6479 c->argv = c->mstate.commands[j].argv;
6480 call(c,c->mstate.commands[j].cmd);
6481 }
6482 c->argv = orig_argv;
6483 c->argc = orig_argc;
6484 freeClientMultiState(c);
6485 initClientMultiState(c);
6486 c->flags &= (~REDIS_MULTI);
6487 }
6488
6489 /* =========================== Blocking Operations ========================= */
6490
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6519
6520 /* Set a client in blocking mode for the specified key, with the specified
6521 * timeout */
6522 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6523 dictEntry *de;
6524 list *l;
6525 int j;
6526
6527 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6528 c->blockingkeysnum = numkeys;
6529 c->blockingto = timeout;
6530 for (j = 0; j < numkeys; j++) {
6531 /* Add the key in the client structure, to map clients -> keys */
6532 c->blockingkeys[j] = keys[j];
6533 incrRefCount(keys[j]);
6534
6535 /* And in the other "side", to map keys -> clients */
6536 de = dictFind(c->db->blockingkeys,keys[j]);
6537 if (de == NULL) {
6538 int retval;
6539
6540 /* For every key we take a list of clients blocked for it */
6541 l = listCreate();
6542 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6543 incrRefCount(keys[j]);
6544 assert(retval == DICT_OK);
6545 } else {
6546 l = dictGetEntryVal(de);
6547 }
6548 listAddNodeTail(l,c);
6549 }
6550 /* Mark the client as a blocked client */
6551 c->flags |= REDIS_BLOCKED;
6552 server.blpop_blocked_clients++;
6553 }
6554
6555 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6556 static void unblockClientWaitingData(redisClient *c) {
6557 dictEntry *de;
6558 list *l;
6559 int j;
6560
6561 assert(c->blockingkeys != NULL);
6562 /* The client may wait for multiple keys, so unblock it for every key. */
6563 for (j = 0; j < c->blockingkeysnum; j++) {
6564 /* Remove this client from the list of clients waiting for this key. */
6565 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6566 assert(de != NULL);
6567 l = dictGetEntryVal(de);
6568 listDelNode(l,listSearchKey(l,c));
6569 /* If the list is empty we need to remove it to avoid wasting memory */
6570 if (listLength(l) == 0)
6571 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6572 decrRefCount(c->blockingkeys[j]);
6573 }
6574 /* Cleanup the client structure */
6575 zfree(c->blockingkeys);
6576 c->blockingkeys = NULL;
6577 c->flags &= (~REDIS_BLOCKED);
6578 server.blpop_blocked_clients--;
6579 /* We want to process data if there is some command waiting
6580 * in the input buffer. Note that this is safe even if
6581 * unblockClientWaitingData() gets called from freeClient() because
6582 * freeClient() will be smart enough to call this function
6583 * *after* c->querybuf was set to NULL. */
6584 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6585 }
6586
6587 /* This should be called from any function PUSHing into lists.
6588 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6589 * 'ele' is the element pushed.
6590 *
6591 * If the function returns 0 there was no client waiting for a list push
6592 * against this key.
6593 *
6594 * If the function returns 1 there was a client waiting for a list push
6595 * against this key, the element was passed to this client thus it's not
6596 * needed to actually add it to the list and the caller should return asap. */
6597 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6598 struct dictEntry *de;
6599 redisClient *receiver;
6600 list *l;
6601 listNode *ln;
6602
6603 de = dictFind(c->db->blockingkeys,key);
6604 if (de == NULL) return 0;
6605 l = dictGetEntryVal(de);
6606 ln = listFirst(l);
6607 assert(ln != NULL);
6608 receiver = ln->value;
6609
6610 addReplySds(receiver,sdsnew("*2\r\n"));
6611 addReplyBulkLen(receiver,key);
6612 addReply(receiver,key);
6613 addReply(receiver,shared.crlf);
6614 addReplyBulkLen(receiver,ele);
6615 addReply(receiver,ele);
6616 addReply(receiver,shared.crlf);
6617 unblockClientWaitingData(receiver);
6618 return 1;
6619 }
6620
6621 /* Blocking RPOP/LPOP */
6622 static void blockingPopGenericCommand(redisClient *c, int where) {
6623 robj *o;
6624 time_t timeout;
6625 int j;
6626
6627 for (j = 1; j < c->argc-1; j++) {
6628 o = lookupKeyWrite(c->db,c->argv[j]);
6629 if (o != NULL) {
6630 if (o->type != REDIS_LIST) {
6631 addReply(c,shared.wrongtypeerr);
6632 return;
6633 } else {
6634 list *list = o->ptr;
6635 if (listLength(list) != 0) {
6636 /* If the list contains elements fall back to the usual
6637 * non-blocking POP operation */
6638 robj *argv[2], **orig_argv;
6639 int orig_argc;
6640
6641 /* We need to alter the command arguments before to call
6642 * popGenericCommand() as the command takes a single key. */
6643 orig_argv = c->argv;
6644 orig_argc = c->argc;
6645 argv[1] = c->argv[j];
6646 c->argv = argv;
6647 c->argc = 2;
6648
6649 /* Also the return value is different, we need to output
6650 * the multi bulk reply header and the key name. The
6651 * "real" command will add the last element (the value)
6652 * for us. If this souds like an hack to you it's just
6653 * because it is... */
6654 addReplySds(c,sdsnew("*2\r\n"));
6655 addReplyBulkLen(c,argv[1]);
6656 addReply(c,argv[1]);
6657 addReply(c,shared.crlf);
6658 popGenericCommand(c,where);
6659
6660 /* Fix the client structure with the original stuff */
6661 c->argv = orig_argv;
6662 c->argc = orig_argc;
6663 return;
6664 }
6665 }
6666 }
6667 }
6668 /* If the list is empty or the key does not exists we must block */
6669 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6670 if (timeout > 0) timeout += time(NULL);
6671 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6672 }
6673
6674 static void blpopCommand(redisClient *c) {
6675 blockingPopGenericCommand(c,REDIS_HEAD);
6676 }
6677
6678 static void brpopCommand(redisClient *c) {
6679 blockingPopGenericCommand(c,REDIS_TAIL);
6680 }
6681
6682 /* =============================== Replication ============================= */
6683
6684 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6685 ssize_t nwritten, ret = size;
6686 time_t start = time(NULL);
6687
6688 timeout++;
6689 while(size) {
6690 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6691 nwritten = write(fd,ptr,size);
6692 if (nwritten == -1) return -1;
6693 ptr += nwritten;
6694 size -= nwritten;
6695 }
6696 if ((time(NULL)-start) > timeout) {
6697 errno = ETIMEDOUT;
6698 return -1;
6699 }
6700 }
6701 return ret;
6702 }
6703
6704 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6705 ssize_t nread, totread = 0;
6706 time_t start = time(NULL);
6707
6708 timeout++;
6709 while(size) {
6710 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6711 nread = read(fd,ptr,size);
6712 if (nread == -1) return -1;
6713 ptr += nread;
6714 size -= nread;
6715 totread += nread;
6716 }
6717 if ((time(NULL)-start) > timeout) {
6718 errno = ETIMEDOUT;
6719 return -1;
6720 }
6721 }
6722 return totread;
6723 }
6724
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time via syncRead() (so 'timeout' applies to every single byte).
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the string terminator. Reading also stops when the
 * buffer is full, that is after 'size'-1 bytes were stored, so the buffer
 * can never be overrun by a long line. Whenever at least one byte is
 * consumed the buffer is NUL terminated.
 *
 * Returns the number of bytes stored before the newline (a stripped '\r'
 * is still counted), or -1 if syncRead() fails. A 'size' that is zero or
 * negative is rejected with -1 and errno set to EINVAL. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }

    size--; /* Reserve room for the string terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--; /* One less byte available in the buffer */
    }
    return nread;
}
6745
6746 static void syncCommand(redisClient *c) {
6747 /* ignore SYNC if aleady slave or in monitor mode */
6748 if (c->flags & REDIS_SLAVE) return;
6749
6750 /* SYNC can't be issued when the server has pending data to send to
6751 * the client about already issued commands. We need a fresh reply
6752 * buffer registering the differences between the BGSAVE and the current
6753 * dataset, so that we can copy to other slaves if needed. */
6754 if (listLength(c->reply) != 0) {
6755 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6756 return;
6757 }
6758
6759 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6760 /* Here we need to check if there is a background saving operation
6761 * in progress, or if it is required to start one */
6762 if (server.bgsavechildpid != -1) {
6763 /* Ok a background save is in progress. Let's check if it is a good
6764 * one for replication, i.e. if there is another slave that is
6765 * registering differences since the server forked to save */
6766 redisClient *slave;
6767 listNode *ln;
6768 listIter li;
6769
6770 listRewind(server.slaves,&li);
6771 while((ln = listNext(&li))) {
6772 slave = ln->value;
6773 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6774 }
6775 if (ln) {
6776 /* Perfect, the server is already registering differences for
6777 * another slave. Set the right state, and copy the buffer. */
6778 listRelease(c->reply);
6779 c->reply = listDup(slave->reply);
6780 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6781 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6782 } else {
6783 /* No way, we need to wait for the next BGSAVE in order to
6784 * register differences */
6785 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6786 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6787 }
6788 } else {
6789 /* Ok we don't have a BGSAVE in progress, let's start one */
6790 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6791 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6792 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6793 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6794 return;
6795 }
6796 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6797 }
6798 c->repldbfd = -1;
6799 c->flags |= REDIS_SLAVE;
6800 c->slaveseldb = 0;
6801 listAddNodeTail(server.slaves,c);
6802 return;
6803 }
6804
6805 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6806 redisClient *slave = privdata;
6807 REDIS_NOTUSED(el);
6808 REDIS_NOTUSED(mask);
6809 char buf[REDIS_IOBUF_LEN];
6810 ssize_t nwritten, buflen;
6811
6812 if (slave->repldboff == 0) {
6813 /* Write the bulk write count before to transfer the DB. In theory here
6814 * we don't know how much room there is in the output buffer of the
6815 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6816 * operations) will never be smaller than the few bytes we need. */
6817 sds bulkcount;
6818
6819 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6820 slave->repldbsize);
6821 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6822 {
6823 sdsfree(bulkcount);
6824 freeClient(slave);
6825 return;
6826 }
6827 sdsfree(bulkcount);
6828 }
6829 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6830 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6831 if (buflen <= 0) {
6832 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6833 (buflen == 0) ? "premature EOF" : strerror(errno));
6834 freeClient(slave);
6835 return;
6836 }
6837 if ((nwritten = write(fd,buf,buflen)) == -1) {
6838 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6839 strerror(errno));
6840 freeClient(slave);
6841 return;
6842 }
6843 slave->repldboff += nwritten;
6844 if (slave->repldboff == slave->repldbsize) {
6845 close(slave->repldbfd);
6846 slave->repldbfd = -1;
6847 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6848 slave->replstate = REDIS_REPL_ONLINE;
6849 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6850 sendReplyToClient, slave) == AE_ERR) {
6851 freeClient(slave);
6852 return;
6853 }
6854 addReplySds(slave,sdsempty());
6855 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6856 }
6857 }
6858
6859 /* This function is called at the end of every backgrond saving.
6860 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6861 * otherwise REDIS_ERR is passed to the function.
6862 *
6863 * The goal of this function is to handle slaves waiting for a successful
6864 * background saving in order to perform non-blocking synchronization. */
6865 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6866 listNode *ln;
6867 int startbgsave = 0;
6868 listIter li;
6869
6870 listRewind(server.slaves,&li);
6871 while((ln = listNext(&li))) {
6872 redisClient *slave = ln->value;
6873
6874 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6875 startbgsave = 1;
6876 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6877 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6878 struct redis_stat buf;
6879
6880 if (bgsaveerr != REDIS_OK) {
6881 freeClient(slave);
6882 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6883 continue;
6884 }
6885 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6886 redis_fstat(slave->repldbfd,&buf) == -1) {
6887 freeClient(slave);
6888 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6889 continue;
6890 }
6891 slave->repldboff = 0;
6892 slave->repldbsize = buf.st_size;
6893 slave->replstate = REDIS_REPL_SEND_BULK;
6894 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6895 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6896 freeClient(slave);
6897 continue;
6898 }
6899 }
6900 }
6901 if (startbgsave) {
6902 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6903 listIter li;
6904
6905 listRewind(server.slaves,&li);
6906 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6907 while((ln = listNext(&li))) {
6908 redisClient *slave = ln->value;
6909
6910 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6911 freeClient(slave);
6912 }
6913 }
6914 }
6915 }
6916
6917 static int syncWithMaster(void) {
6918 char buf[1024], tmpfile[256], authcmd[1024];
6919 long dumpsize;
6920 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6921 int dfd;
6922
6923 if (fd == -1) {
6924 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6925 strerror(errno));
6926 return REDIS_ERR;
6927 }
6928
6929 /* AUTH with the master if required. */
6930 if(server.masterauth) {
6931 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6932 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6933 close(fd);
6934 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6935 strerror(errno));
6936 return REDIS_ERR;
6937 }
6938 /* Read the AUTH result. */
6939 if (syncReadLine(fd,buf,1024,3600) == -1) {
6940 close(fd);
6941 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6942 strerror(errno));
6943 return REDIS_ERR;
6944 }
6945 if (buf[0] != '+') {
6946 close(fd);
6947 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6948 return REDIS_ERR;
6949 }
6950 }
6951
6952 /* Issue the SYNC command */
6953 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6954 close(fd);
6955 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6956 strerror(errno));
6957 return REDIS_ERR;
6958 }
6959 /* Read the bulk write count */
6960 if (syncReadLine(fd,buf,1024,3600) == -1) {
6961 close(fd);
6962 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6963 strerror(errno));
6964 return REDIS_ERR;
6965 }
6966 if (buf[0] != '$') {
6967 close(fd);
6968 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6969 return REDIS_ERR;
6970 }
6971 dumpsize = strtol(buf+1,NULL,10);
6972 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
6973 /* Read the bulk write data on a temp file */
6974 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6975 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6976 if (dfd == -1) {
6977 close(fd);
6978 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6979 return REDIS_ERR;
6980 }
6981 while(dumpsize) {
6982 int nread, nwritten;
6983
6984 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6985 if (nread == -1) {
6986 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6987 strerror(errno));
6988 close(fd);
6989 close(dfd);
6990 return REDIS_ERR;
6991 }
6992 nwritten = write(dfd,buf,nread);
6993 if (nwritten == -1) {
6994 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6995 close(fd);
6996 close(dfd);
6997 return REDIS_ERR;
6998 }
6999 dumpsize -= nread;
7000 }
7001 close(dfd);
7002 if (rename(tmpfile,server.dbfilename) == -1) {
7003 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7004 unlink(tmpfile);
7005 close(fd);
7006 return REDIS_ERR;
7007 }
7008 emptyDb();
7009 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7010 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7011 close(fd);
7012 return REDIS_ERR;
7013 }
7014 server.master = createClient(fd);
7015 server.master->flags |= REDIS_MASTER;
7016 server.master->authenticated = 1;
7017 server.replstate = REDIS_REPL_CONNECTED;
7018 return REDIS_OK;
7019 }
7020
7021 static void slaveofCommand(redisClient *c) {
7022 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7023 !strcasecmp(c->argv[2]->ptr,"one")) {
7024 if (server.masterhost) {
7025 sdsfree(server.masterhost);
7026 server.masterhost = NULL;
7027 if (server.master) freeClient(server.master);
7028 server.replstate = REDIS_REPL_NONE;
7029 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7030 }
7031 } else {
7032 sdsfree(server.masterhost);
7033 server.masterhost = sdsdup(c->argv[1]->ptr);
7034 server.masterport = atoi(c->argv[2]->ptr);
7035 if (server.master) freeClient(server.master);
7036 server.replstate = REDIS_REPL_CONNECT;
7037 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7038 server.masterhost, server.masterport);
7039 }
7040 addReply(c,shared.ok);
7041 }
7042
7043 /* ============================ Maxmemory directive ======================== */
7044
7045 /* Try to free one object form the pre-allocated objects free list.
7046 * This is useful under low mem conditions as by default we take 1 million
7047 * free objects allocated. On success REDIS_OK is returned, otherwise
7048 * REDIS_ERR. */
7049 static int tryFreeOneObjectFromFreelist(void) {
7050 robj *o;
7051
7052 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7053 if (listLength(server.objfreelist)) {
7054 listNode *head = listFirst(server.objfreelist);
7055 o = listNodeValue(head);
7056 listDelNode(server.objfreelist,head);
7057 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7058 zfree(o);
7059 return REDIS_OK;
7060 } else {
7061 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7062 return REDIS_ERR;
7063 }
7064 }
7065
7066 /* This function gets called when 'maxmemory' is set on the config file to limit
7067 * the max memory used by the server, and we are out of memory.
7068 * This function will try to, in order:
7069 *
7070 * - Free objects from the free list
7071 * - Try to remove keys with an EXPIRE set
7072 *
7073 * It is not possible to free enough memory to reach used-memory < maxmemory
7074 * the server will start refusing commands that will enlarge even more the
7075 * memory usage.
7076 */
7077 static void freeMemoryIfNeeded(void) {
7078 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7079 int j, k, freed = 0;
7080
7081 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7082 for (j = 0; j < server.dbnum; j++) {
7083 int minttl = -1;
7084 robj *minkey = NULL;
7085 struct dictEntry *de;
7086
7087 if (dictSize(server.db[j].expires)) {
7088 freed = 1;
7089 /* From a sample of three keys drop the one nearest to
7090 * the natural expire */
7091 for (k = 0; k < 3; k++) {
7092 time_t t;
7093
7094 de = dictGetRandomKey(server.db[j].expires);
7095 t = (time_t) dictGetEntryVal(de);
7096 if (minttl == -1 || t < minttl) {
7097 minkey = dictGetEntryKey(de);
7098 minttl = t;
7099 }
7100 }
7101 deleteKey(server.db+j,minkey);
7102 }
7103 }
7104 if (!freed) return; /* nothing to free... */
7105 }
7106 }
7107
7108 /* ============================== Append Only file ========================== */
7109
7110 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7111 sds buf = sdsempty();
7112 int j;
7113 ssize_t nwritten;
7114 time_t now;
7115 robj *tmpargv[3];
7116
7117 /* The DB this command was targetting is not the same as the last command
7118 * we appendend. To issue a SELECT command is needed. */
7119 if (dictid != server.appendseldb) {
7120 char seldb[64];
7121
7122 snprintf(seldb,sizeof(seldb),"%d",dictid);
7123 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7124 (unsigned long)strlen(seldb),seldb);
7125 server.appendseldb = dictid;
7126 }
7127
7128 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7129 * EXPIREs into EXPIREATs calls */
7130 if (cmd->proc == expireCommand) {
7131 long when;
7132
7133 tmpargv[0] = createStringObject("EXPIREAT",8);
7134 tmpargv[1] = argv[1];
7135 incrRefCount(argv[1]);
7136 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7137 tmpargv[2] = createObject(REDIS_STRING,
7138 sdscatprintf(sdsempty(),"%ld",when));
7139 argv = tmpargv;
7140 }
7141
7142 /* Append the actual command */
7143 buf = sdscatprintf(buf,"*%d\r\n",argc);
7144 for (j = 0; j < argc; j++) {
7145 robj *o = argv[j];
7146
7147 o = getDecodedObject(o);
7148 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7149 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7150 buf = sdscatlen(buf,"\r\n",2);
7151 decrRefCount(o);
7152 }
7153
7154 /* Free the objects from the modified argv for EXPIREAT */
7155 if (cmd->proc == expireCommand) {
7156 for (j = 0; j < 3; j++)
7157 decrRefCount(argv[j]);
7158 }
7159
7160 /* We want to perform a single write. This should be guaranteed atomic
7161 * at least if the filesystem we are writing is a real physical one.
7162 * While this will save us against the server being killed I don't think
7163 * there is much to do about the whole server stopping for power problems
7164 * or alike */
7165 nwritten = write(server.appendfd,buf,sdslen(buf));
7166 if (nwritten != (signed)sdslen(buf)) {
7167 /* Ooops, we are in troubles. The best thing to do for now is
7168 * to simply exit instead to give the illusion that everything is
7169 * working as expected. */
7170 if (nwritten == -1) {
7171 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7172 } else {
7173 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7174 }
7175 exit(1);
7176 }
7177 /* If a background append only file rewriting is in progress we want to
7178 * accumulate the differences between the child DB and the current one
7179 * in a buffer, so that when the child process will do its work we
7180 * can append the differences to the new append only file. */
7181 if (server.bgrewritechildpid != -1)
7182 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7183
7184 sdsfree(buf);
7185 now = time(NULL);
7186 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7187 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7188 now-server.lastfsync > 1))
7189 {
7190 fsync(server.appendfd); /* Let's try to get this data on the disk */
7191 server.lastfsync = now;
7192 }
7193 }
7194
7195 /* In Redis commands are always executed in the context of a client, so in
7196 * order to load the append only file we need to create a fake client. */
7197 static struct redisClient *createFakeClient(void) {
7198 struct redisClient *c = zmalloc(sizeof(*c));
7199
7200 selectDb(c,0);
7201 c->fd = -1;
7202 c->querybuf = sdsempty();
7203 c->argc = 0;
7204 c->argv = NULL;
7205 c->flags = 0;
7206 /* We set the fake client as a slave waiting for the synchronization
7207 * so that Redis will not try to send replies to this client. */
7208 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7209 c->reply = listCreate();
7210 listSetFreeMethod(c->reply,decrRefCount);
7211 listSetDupMethod(c->reply,dupClientReplyValue);
7212 return c;
7213 }
7214
7215 static void freeFakeClient(struct redisClient *c) {
7216 sdsfree(c->querybuf);
7217 listRelease(c->reply);
7218 zfree(c);
7219 }
7220
7221 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7222 * error (the append only file is zero-length) REDIS_ERR is returned. On
7223 * fatal error an error message is logged and the program exists. */
7224 int loadAppendOnlyFile(char *filename) {
7225 struct redisClient *fakeClient;
7226 FILE *fp = fopen(filename,"r");
7227 struct redis_stat sb;
7228 unsigned long long loadedkeys = 0;
7229
7230 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7231 return REDIS_ERR;
7232
7233 if (fp == NULL) {
7234 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7235 exit(1);
7236 }
7237
7238 fakeClient = createFakeClient();
7239 while(1) {
7240 int argc, j;
7241 unsigned long len;
7242 robj **argv;
7243 char buf[128];
7244 sds argsds;
7245 struct redisCommand *cmd;
7246
7247 if (fgets(buf,sizeof(buf),fp) == NULL) {
7248 if (feof(fp))
7249 break;
7250 else
7251 goto readerr;
7252 }
7253 if (buf[0] != '*') goto fmterr;
7254 argc = atoi(buf+1);
7255 argv = zmalloc(sizeof(robj*)*argc);
7256 for (j = 0; j < argc; j++) {
7257 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7258 if (buf[0] != '$') goto fmterr;
7259 len = strtol(buf+1,NULL,10);
7260 argsds = sdsnewlen(NULL,len);
7261 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7262 argv[j] = createObject(REDIS_STRING,argsds);
7263 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7264 }
7265
7266 /* Command lookup */
7267 cmd = lookupCommand(argv[0]->ptr);
7268 if (!cmd) {
7269 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7270 exit(1);
7271 }
7272 /* Try object sharing and encoding */
7273 if (server.shareobjects) {
7274 int j;
7275 for(j = 1; j < argc; j++)
7276 argv[j] = tryObjectSharing(argv[j]);
7277 }
7278 if (cmd->flags & REDIS_CMD_BULK)
7279 tryObjectEncoding(argv[argc-1]);
7280 /* Run the command in the context of a fake client */
7281 fakeClient->argc = argc;
7282 fakeClient->argv = argv;
7283 cmd->proc(fakeClient);
7284 /* Discard the reply objects list from the fake client */
7285 while(listLength(fakeClient->reply))
7286 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7287 /* Clean up, ready for the next command */
7288 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7289 zfree(argv);
7290 /* Handle swapping while loading big datasets when VM is on */
7291 loadedkeys++;
7292 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7293 while (zmalloc_used_memory() > server.vm_max_memory) {
7294 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7295 }
7296 }
7297 }
7298 fclose(fp);
7299 freeFakeClient(fakeClient);
7300 return REDIS_OK;
7301
7302 readerr:
7303 if (feof(fp)) {
7304 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7305 } else {
7306 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7307 }
7308 exit(1);
7309 fmterr:
7310 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7311 exit(1);
7312 }
7313
7314 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7315 static int fwriteBulk(FILE *fp, robj *obj) {
7316 char buf[128];
7317 int decrrc = 0;
7318
7319 /* Avoid the incr/decr ref count business if possible to help
7320 * copy-on-write (we are often in a child process when this function
7321 * is called).
7322 * Also makes sure that key objects don't get incrRefCount-ed when VM
7323 * is enabled */
7324 if (obj->encoding != REDIS_ENCODING_RAW) {
7325 obj = getDecodedObject(obj);
7326 decrrc = 1;
7327 }
7328 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7329 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7330 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7331 goto err;
7332 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7333 if (decrrc) decrRefCount(obj);
7334 return 1;
7335 err:
7336 if (decrrc) decrRefCount(obj);
7337 return 0;
7338 }
7339
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    int plen, hlen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised length. */
    plen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,hlen,1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
7350
/* Write the long 'l', in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    int plen, hlen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised length. */
    plen = snprintf(payload,sizeof(payload),"%ld\r\n",l);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,hlen,1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
7361
7362 /* Write a sequence of commands able to fully rebuild the dataset into
7363 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7364 static int rewriteAppendOnlyFile(char *filename) {
7365 dictIterator *di = NULL;
7366 dictEntry *de;
7367 FILE *fp;
7368 char tmpfile[256];
7369 int j;
7370 time_t now = time(NULL);
7371
7372 /* Note that we have to use a different temp name here compared to the
7373 * one used by rewriteAppendOnlyFileBackground() function. */
7374 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7375 fp = fopen(tmpfile,"w");
7376 if (!fp) {
7377 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7378 return REDIS_ERR;
7379 }
7380 for (j = 0; j < server.dbnum; j++) {
7381 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7382 redisDb *db = server.db+j;
7383 dict *d = db->dict;
7384 if (dictSize(d) == 0) continue;
7385 di = dictGetIterator(d);
7386 if (!di) {
7387 fclose(fp);
7388 return REDIS_ERR;
7389 }
7390
7391 /* SELECT the new DB */
7392 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7393 if (fwriteBulkLong(fp,j) == 0) goto werr;
7394
7395 /* Iterate this DB writing every entry */
7396 while((de = dictNext(di)) != NULL) {
7397 robj *key, *o;
7398 time_t expiretime;
7399 int swapped;
7400
7401 key = dictGetEntryKey(de);
7402 /* If the value for this key is swapped, load a preview in memory.
7403 * We use a "swapped" flag to remember if we need to free the
7404 * value object instead to just increment the ref count anyway
7405 * in order to avoid copy-on-write of pages if we are forked() */
7406 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7407 key->storage == REDIS_VM_SWAPPING) {
7408 o = dictGetEntryVal(de);
7409 swapped = 0;
7410 } else {
7411 o = vmPreviewObject(key);
7412 swapped = 1;
7413 }
7414 expiretime = getExpire(db,key);
7415
7416 /* Save the key and associated value */
7417 if (o->type == REDIS_STRING) {
7418 /* Emit a SET command */
7419 char cmd[]="*3\r\n$3\r\nSET\r\n";
7420 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7421 /* Key and value */
7422 if (fwriteBulk(fp,key) == 0) goto werr;
7423 if (fwriteBulk(fp,o) == 0) goto werr;
7424 } else if (o->type == REDIS_LIST) {
7425 /* Emit the RPUSHes needed to rebuild the list */
7426 list *list = o->ptr;
7427 listNode *ln;
7428 listIter li;
7429
7430 listRewind(list,&li);
7431 while((ln = listNext(&li))) {
7432 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7433 robj *eleobj = listNodeValue(ln);
7434
7435 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7436 if (fwriteBulk(fp,key) == 0) goto werr;
7437 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7438 }
7439 } else if (o->type == REDIS_SET) {
7440 /* Emit the SADDs needed to rebuild the set */
7441 dict *set = o->ptr;
7442 dictIterator *di = dictGetIterator(set);
7443 dictEntry *de;
7444
7445 while((de = dictNext(di)) != NULL) {
7446 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7447 robj *eleobj = dictGetEntryKey(de);
7448
7449 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7450 if (fwriteBulk(fp,key) == 0) goto werr;
7451 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7452 }
7453 dictReleaseIterator(di);
7454 } else if (o->type == REDIS_ZSET) {
7455 /* Emit the ZADDs needed to rebuild the sorted set */
7456 zset *zs = o->ptr;
7457 dictIterator *di = dictGetIterator(zs->dict);
7458 dictEntry *de;
7459
7460 while((de = dictNext(di)) != NULL) {
7461 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7462 robj *eleobj = dictGetEntryKey(de);
7463 double *score = dictGetEntryVal(de);
7464
7465 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7466 if (fwriteBulk(fp,key) == 0) goto werr;
7467 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7468 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7469 }
7470 dictReleaseIterator(di);
7471 } else {
7472 redisAssert(0 != 0);
7473 }
7474 /* Save the expire time */
7475 if (expiretime != -1) {
7476 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7477 /* If this key is already expired skip it */
7478 if (expiretime < now) continue;
7479 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7480 if (fwriteBulk(fp,key) == 0) goto werr;
7481 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7482 }
7483 if (swapped) decrRefCount(o);
7484 }
7485 dictReleaseIterator(di);
7486 }
7487
7488 /* Make sure data will not remain on the OS's output buffers */
7489 fflush(fp);
7490 fsync(fileno(fp));
7491 fclose(fp);
7492
7493 /* Use RENAME to make sure the DB file is changed atomically only
7494 * if the generate DB file is ok. */
7495 if (rename(tmpfile,filename) == -1) {
7496 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7497 unlink(tmpfile);
7498 return REDIS_ERR;
7499 }
7500 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7501 return REDIS_OK;
7502
7503 werr:
7504 fclose(fp);
7505 unlink(tmpfile);
7506 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7507 if (di) dictReleaseIterator(di);
7508 return REDIS_ERR;
7509 }
7510
7511 /* This is how rewriting of the append only file in background works:
7512 *
7513 * 1) The user calls BGREWRITEAOF
7514 * 2) Redis calls this function, that forks():
7515 * 2a) the child rewrite the append only file in a temp file.
7516 * 2b) the parent accumulates differences in server.bgrewritebuf.
7517 * 3) When the child finished '2a' exists.
7518 * 4) The parent will trap the exit code, if it's OK, will append the
7519 * data accumulated into server.bgrewritebuf into the temp file, and
7520 * finally will rename(2) the temp file in the actual file name.
7521 * The the new file is reopened as the new append only file. Profit!
7522 */
7523 static int rewriteAppendOnlyFileBackground(void) {
7524 pid_t childpid;
7525
7526 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7527 if (server.vm_enabled) waitEmptyIOJobsQueue();
7528 if ((childpid = fork()) == 0) {
7529 /* Child */
7530 char tmpfile[256];
7531
7532 if (server.vm_enabled) vmReopenSwapFile();
7533 close(server.fd);
7534 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7535 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7536 _exit(0);
7537 } else {
7538 _exit(1);
7539 }
7540 } else {
7541 /* Parent */
7542 if (childpid == -1) {
7543 redisLog(REDIS_WARNING,
7544 "Can't rewrite append only file in background: fork: %s",
7545 strerror(errno));
7546 return REDIS_ERR;
7547 }
7548 redisLog(REDIS_NOTICE,
7549 "Background append only file rewriting started by pid %d",childpid);
7550 server.bgrewritechildpid = childpid;
7551 /* We set appendseldb to -1 in order to force the next call to the
7552 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7553 * accumulated by the parent into server.bgrewritebuf will start
7554 * with a SELECT statement and it will be safe to merge. */
7555 server.appendseldb = -1;
7556 return REDIS_OK;
7557 }
7558 return REDIS_OK; /* unreached */
7559 }
7560
7561 static void bgrewriteaofCommand(redisClient *c) {
7562 if (server.bgrewritechildpid != -1) {
7563 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7564 return;
7565 }
7566 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7567 char *status = "+Background append only file rewriting started\r\n";
7568 addReplySds(c,sdsnew(status));
7569 } else {
7570 addReply(c,shared.err);
7571 }
7572 }
7573
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" that belongs to the
 * background rewrite child with the given pid. Errors from unlink(2) are
 * ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7580
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
7601
7602 /* =================== Virtual Memory - Blocking Side ====================== */
7603
7604 /* substitute the first occurrence of '%p' with the process pid in the
7605 * swap file name. */
7606 static void expandVmSwapFilename(void) {
7607 char *p = strstr(server.vm_swap_file,"%p");
7608 sds new;
7609
7610 if (!p) return;
7611 new = sdsempty();
7612 *p = '\0';
7613 new = sdscat(new,server.vm_swap_file);
7614 new = sdscatprintf(new,"%ld",(long) getpid());
7615 new = sdscat(new,p+2);
7616 zfree(server.vm_swap_file);
7617 server.vm_swap_file = new;
7618 }
7619
7620 static void vmInit(void) {
7621 off_t totsize;
7622 int pipefds[2];
7623 size_t stacksize;
7624
7625 if (server.vm_max_threads != 0)
7626 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7627
7628 expandVmSwapFilename();
7629 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7630 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7631 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7632 }
7633 if (server.vm_fp == NULL) {
7634 redisLog(REDIS_WARNING,
7635 "Impossible to open the swap file: %s. Exiting.",
7636 strerror(errno));
7637 exit(1);
7638 }
7639 server.vm_fd = fileno(server.vm_fp);
7640 server.vm_next_page = 0;
7641 server.vm_near_pages = 0;
7642 server.vm_stats_used_pages = 0;
7643 server.vm_stats_swapped_objects = 0;
7644 server.vm_stats_swapouts = 0;
7645 server.vm_stats_swapins = 0;
7646 totsize = server.vm_pages*server.vm_page_size;
7647 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7648 if (ftruncate(server.vm_fd,totsize) == -1) {
7649 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7650 strerror(errno));
7651 exit(1);
7652 } else {
7653 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7654 }
7655 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7656 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7657 (long long) (server.vm_pages+7)/8, server.vm_pages);
7658 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7659
7660 /* Initialize threaded I/O (used by Virtual Memory) */
7661 server.io_newjobs = listCreate();
7662 server.io_processing = listCreate();
7663 server.io_processed = listCreate();
7664 server.io_ready_clients = listCreate();
7665 pthread_mutex_init(&server.io_mutex,NULL);
7666 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7667 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7668 server.io_active_threads = 0;
7669 if (pipe(pipefds) == -1) {
7670 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7671 ,strerror(errno));
7672 exit(1);
7673 }
7674 server.io_ready_pipe_read = pipefds[0];
7675 server.io_ready_pipe_write = pipefds[1];
7676 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7677 /* LZF requires a lot of stack */
7678 pthread_attr_init(&server.io_threads_attr);
7679 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7680 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7681 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7682 /* Listen for events in the threaded I/O pipe */
7683 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7684 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7685 oom("creating file event");
7686 }
7687
7688 /* Mark the page as used */
7689 static void vmMarkPageUsed(off_t page) {
7690 off_t byte = page/8;
7691 int bit = page&7;
7692 redisAssert(vmFreePage(page) == 1);
7693 server.vm_bitmap[byte] |= 1<<bit;
7694 }
7695
7696 /* Mark N contiguous pages as used, with 'page' being the first. */
7697 static void vmMarkPagesUsed(off_t page, off_t count) {
7698 off_t j;
7699
7700 for (j = 0; j < count; j++)
7701 vmMarkPageUsed(page+j);
7702 server.vm_stats_used_pages += count;
7703 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7704 (long long)count, (long long)page);
7705 }
7706
7707 /* Mark the page as free */
7708 static void vmMarkPageFree(off_t page) {
7709 off_t byte = page/8;
7710 int bit = page&7;
7711 redisAssert(vmFreePage(page) == 0);
7712 server.vm_bitmap[byte] &= ~(1<<bit);
7713 }
7714
7715 /* Mark N contiguous pages as free, with 'page' being the first. */
7716 static void vmMarkPagesFree(off_t page, off_t count) {
7717 off_t j;
7718
7719 for (j = 0; j < count; j++)
7720 vmMarkPageFree(page+j);
7721 server.vm_stats_used_pages -= count;
7722 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7723 (long long)count, (long long)page);
7724 }
7725
7726 /* Test if the page is free */
7727 static int vmFreePage(off_t page) {
7728 off_t byte = page/8;
7729 int bit = page&7;
7730 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7731 }
7732
7733 /* Find N contiguous free pages storing the first page of the cluster in *first.
7734 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7735 * REDIS_ERR is returned.
7736 *
7737 * This function uses a simple algorithm: we try to allocate
7738 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7739 * again from the start of the swap file searching for free spaces.
7740 *
7741 * If it looks pretty clear that there are no free pages near our offset
7742 * we try to find less populated places doing a forward jump of
7743 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7744 * without hurry, and then we jump again and so forth...
7745 *
7746 * This function can be improved using a free list to avoid to guess
7747 * too much, since we could collect data about freed pages.
7748 *
7749 * note: I implemented this function just after watching an episode of
7750 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7751 */
7752 static int vmFindContiguousPages(off_t *first, off_t n) {
7753 off_t base, offset = 0, since_jump = 0, numfree = 0;
7754
7755 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7756 server.vm_near_pages = 0;
7757 server.vm_next_page = 0;
7758 }
7759 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7760 base = server.vm_next_page;
7761
7762 while(offset < server.vm_pages) {
7763 off_t this = base+offset;
7764
7765 /* If we overflow, restart from page zero */
7766 if (this >= server.vm_pages) {
7767 this -= server.vm_pages;
7768 if (this == 0) {
7769 /* Just overflowed, what we found on tail is no longer
7770 * interesting, as it's no longer contiguous. */
7771 numfree = 0;
7772 }
7773 }
7774 if (vmFreePage(this)) {
7775 /* This is a free page */
7776 numfree++;
7777 /* Already got N free pages? Return to the caller, with success */
7778 if (numfree == n) {
7779 *first = this-(n-1);
7780 server.vm_next_page = this+1;
7781 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7782 return REDIS_OK;
7783 }
7784 } else {
7785 /* The current one is not a free page */
7786 numfree = 0;
7787 }
7788
7789 /* Fast-forward if the current page is not free and we already
7790 * searched enough near this place. */
7791 since_jump++;
7792 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7793 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7794 since_jump = 0;
7795 /* Note that even if we rewind after the jump, we are don't need
7796 * to make sure numfree is set to zero as we only jump *if* it
7797 * is set to zero. */
7798 } else {
7799 /* Otherwise just check the next page */
7800 offset++;
7801 }
7802 }
7803 return REDIS_ERR;
7804 }
7805
7806 /* Write the specified object at the specified page of the swap file */
7807 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7808 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7809 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7810 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7811 redisLog(REDIS_WARNING,
7812 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7813 strerror(errno));
7814 return REDIS_ERR;
7815 }
7816 rdbSaveObject(server.vm_fp,o);
7817 fflush(server.vm_fp);
7818 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7819 return REDIS_OK;
7820 }
7821
7822 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7823 * needed to later retrieve the object into the key object.
7824 * If we can't find enough contiguous empty pages to swap the object on disk
7825 * REDIS_ERR is returned. */
7826 static int vmSwapObjectBlocking(robj *key, robj *val) {
7827 off_t pages = rdbSavedObjectPages(val,NULL);
7828 off_t page;
7829
7830 assert(key->storage == REDIS_VM_MEMORY);
7831 assert(key->refcount == 1);
7832 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7833 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7834 key->vm.page = page;
7835 key->vm.usedpages = pages;
7836 key->storage = REDIS_VM_SWAPPED;
7837 key->vtype = val->type;
7838 decrRefCount(val); /* Deallocate the object from memory. */
7839 vmMarkPagesUsed(page,pages);
7840 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7841 (unsigned char*) key->ptr,
7842 (unsigned long long) page, (unsigned long long) pages);
7843 server.vm_stats_swapped_objects++;
7844 server.vm_stats_swapouts++;
7845 return REDIS_OK;
7846 }
7847
7848 static robj *vmReadObjectFromSwap(off_t page, int type) {
7849 robj *o;
7850
7851 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7852 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7853 redisLog(REDIS_WARNING,
7854 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7855 strerror(errno));
7856 _exit(1);
7857 }
7858 o = rdbLoadObject(type,server.vm_fp);
7859 if (o == NULL) {
7860 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7861 _exit(1);
7862 }
7863 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7864 return o;
7865 }
7866
7867 /* Load the value object relative to the 'key' object from swap to memory.
7868 * The newly allocated object is returned.
7869 *
7870 * If preview is true the unserialized object is returned to the caller but
7871 * no changes are made to the key object, nor the pages are marked as freed */
7872 static robj *vmGenericLoadObject(robj *key, int preview) {
7873 robj *val;
7874
7875 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7876 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7877 if (!preview) {
7878 key->storage = REDIS_VM_MEMORY;
7879 key->vm.atime = server.unixtime;
7880 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7881 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7882 (unsigned char*) key->ptr);
7883 server.vm_stats_swapped_objects--;
7884 } else {
7885 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7886 (unsigned char*) key->ptr);
7887 }
7888 server.vm_stats_swapins++;
7889 return val;
7890 }
7891
7892 /* Plain object loading, from swap to memory */
7893 static robj *vmLoadObject(robj *key) {
7894 /* If we are loading the object in background, stop it, we
7895 * need to load this object synchronously ASAP. */
7896 if (key->storage == REDIS_VM_LOADING)
7897 vmCancelThreadedIOJob(key);
7898 return vmGenericLoadObject(key,0);
7899 }
7900
7901 /* Just load the value on disk, without to modify the key.
7902 * This is useful when we want to perform some operation on the value
7903 * without to really bring it from swap to memory, like while saving the
7904 * dataset or rewriting the append only log. */
7905 static robj *vmPreviewObject(robj *key) {
7906 return vmGenericLoadObject(key,1);
7907 }
7908
7909 /* How a good candidate is this object for swapping?
7910 * The better candidate it is, the greater the returned value.
7911 *
7912 * Currently we try to perform a fast estimation of the object size in
7913 * memory, and combine it with aging informations.
7914 *
7915 * Basically swappability = idle-time * log(estimated size)
7916 *
7917 * Bigger objects are preferred over smaller objects, but not
7918 * proportionally, this is why we use the logarithm. This algorithm is
7919 * just a first try and will probably be tuned later. */
7920 static double computeObjectSwappability(robj *o) {
7921 time_t age = server.unixtime - o->vm.atime;
7922 long asize = 0;
7923 list *l;
7924 dict *d;
7925 struct dictEntry *de;
7926 int z;
7927
7928 if (age <= 0) return 0;
7929 switch(o->type) {
7930 case REDIS_STRING:
7931 if (o->encoding != REDIS_ENCODING_RAW) {
7932 asize = sizeof(*o);
7933 } else {
7934 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7935 }
7936 break;
7937 case REDIS_LIST:
7938 l = o->ptr;
7939 listNode *ln = listFirst(l);
7940
7941 asize = sizeof(list);
7942 if (ln) {
7943 robj *ele = ln->value;
7944 long elesize;
7945
7946 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7947 (sizeof(*o)+sdslen(ele->ptr)) :
7948 sizeof(*o);
7949 asize += (sizeof(listNode)+elesize)*listLength(l);
7950 }
7951 break;
7952 case REDIS_SET:
7953 case REDIS_ZSET:
7954 z = (o->type == REDIS_ZSET);
7955 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7956
7957 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7958 if (z) asize += sizeof(zset)-sizeof(dict);
7959 if (dictSize(d)) {
7960 long elesize;
7961 robj *ele;
7962
7963 de = dictGetRandomKey(d);
7964 ele = dictGetEntryKey(de);
7965 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7966 (sizeof(*o)+sdslen(ele->ptr)) :
7967 sizeof(*o);
7968 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7969 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7970 }
7971 break;
7972 }
7973 return (double)age*log(1+asize);
7974 }
7975
7976 /* Try to swap an object that's a good candidate for swapping.
7977 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
7978 * to swap any object at all.
7979 *
7980 * If 'usethreaded' is true, Redis will try to swap the object in background
7981 * using I/O threads. */
7982 static int vmSwapOneObject(int usethreads) {
7983 int j, i;
7984 struct dictEntry *best = NULL;
7985 double best_swappability = 0;
7986 redisDb *best_db = NULL;
7987 robj *key, *val;
7988
7989 for (j = 0; j < server.dbnum; j++) {
7990 redisDb *db = server.db+j;
7991 /* Why maxtries is set to 100?
7992 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7993 * are swappable objects */
7994 int maxtries = 100;
7995
7996 if (dictSize(db->dict) == 0) continue;
7997 for (i = 0; i < 5; i++) {
7998 dictEntry *de;
7999 double swappability;
8000
8001 if (maxtries) maxtries--;
8002 de = dictGetRandomKey(db->dict);
8003 key = dictGetEntryKey(de);
8004 val = dictGetEntryVal(de);
8005 /* Only swap objects that are currently in memory.
8006 *
8007 * Also don't swap shared objects if threaded VM is on, as we
8008 * try to ensure that the main thread does not touch the
8009 * object while the I/O thread is using it, but we can't
8010 * control other keys without adding additional mutex. */
8011 if (key->storage != REDIS_VM_MEMORY ||
8012 (server.vm_max_threads != 0 && val->refcount != 1)) {
8013 if (maxtries) i--; /* don't count this try */
8014 continue;
8015 }
8016 swappability = computeObjectSwappability(val);
8017 if (!best || swappability > best_swappability) {
8018 best = de;
8019 best_swappability = swappability;
8020 best_db = db;
8021 }
8022 }
8023 }
8024 if (best == NULL) return REDIS_ERR;
8025 key = dictGetEntryKey(best);
8026 val = dictGetEntryVal(best);
8027
8028 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8029 key->ptr, best_swappability);
8030
8031 /* Unshare the key if needed */
8032 if (key->refcount > 1) {
8033 robj *newkey = dupStringObject(key);
8034 decrRefCount(key);
8035 key = dictGetEntryKey(best) = newkey;
8036 }
8037 /* Swap it */
8038 if (usethreads) {
8039 vmSwapObjectThreaded(key,val,best_db);
8040 return REDIS_OK;
8041 } else {
8042 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8043 dictGetEntryVal(best) = NULL;
8044 return REDIS_OK;
8045 } else {
8046 return REDIS_ERR;
8047 }
8048 }
8049 }
8050
/* Swap out one object synchronously, in the calling thread.
 * Returns what vmSwapOneObject(0) returns: REDIS_OK or REDIS_ERR.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype taking no arguments, like vmCanSwapOut(void). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8054
/* Swap out one object in background, using the I/O threads.
 * Returns what vmSwapOneObject(1) returns: REDIS_OK or REDIS_ERR.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype taking no arguments, like vmCanSwapOut(void). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8058
8059 /* Return true if it's safe to swap out objects in a given moment.
8060 * Basically we don't want to swap objects out while there is a BGSAVE
8061 * or a BGAEOREWRITE running in backgroud. */
8062 static int vmCanSwapOut(void) {
8063 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8064 }
8065
8066 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8067 * and was deleted. Otherwise 0 is returned. */
8068 static int deleteIfSwapped(redisDb *db, robj *key) {
8069 dictEntry *de;
8070 robj *foundkey;
8071
8072 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8073 foundkey = dictGetEntryKey(de);
8074 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8075 deleteKey(db,key);
8076 return 1;
8077 }
8078
8079 /* =================== Virtual Memory - Threaded I/O ======================= */
8080
8081 static void freeIOJob(iojob *j) {
8082 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8083 j->type == REDIS_IOJOB_DO_SWAP ||
8084 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8085 decrRefCount(j->val);
8086 decrRefCount(j->key);
8087 zfree(j);
8088 }
8089
8090 /* Every time a thread finished a Job, it writes a byte into the write side
8091 * of an unix pipe in order to "awake" the main thread, and this function
8092 * is called. */
8093 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8094 int mask)
8095 {
8096 char buf[1];
8097 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8098 REDIS_NOTUSED(el);
8099 REDIS_NOTUSED(mask);
8100 REDIS_NOTUSED(privdata);
8101
8102 /* For every byte we read in the read side of the pipe, there is one
8103 * I/O job completed to process. */
8104 while((retval = read(fd,buf,1)) == 1) {
8105 iojob *j;
8106 listNode *ln;
8107 robj *key;
8108 struct dictEntry *de;
8109
8110 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8111
8112 /* Get the processed element (the oldest one) */
8113 lockThreadedIO();
8114 assert(listLength(server.io_processed) != 0);
8115 if (toprocess == -1) {
8116 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8117 if (toprocess <= 0) toprocess = 1;
8118 }
8119 ln = listFirst(server.io_processed);
8120 j = ln->value;
8121 listDelNode(server.io_processed,ln);
8122 unlockThreadedIO();
8123 /* If this job is marked as canceled, just ignore it */
8124 if (j->canceled) {
8125 freeIOJob(j);
8126 continue;
8127 }
8128 /* Post process it in the main thread, as there are things we
8129 * can do just here to avoid race conditions and/or invasive locks */
8130 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8131 de = dictFind(j->db->dict,j->key);
8132 assert(de != NULL);
8133 key = dictGetEntryKey(de);
8134 if (j->type == REDIS_IOJOB_LOAD) {
8135 redisDb *db;
8136
8137 /* Key loaded, bring it at home */
8138 key->storage = REDIS_VM_MEMORY;
8139 key->vm.atime = server.unixtime;
8140 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8141 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8142 (unsigned char*) key->ptr);
8143 server.vm_stats_swapped_objects--;
8144 server.vm_stats_swapins++;
8145 dictGetEntryVal(de) = j->val;
8146 incrRefCount(j->val);
8147 db = j->db;
8148 freeIOJob(j);
8149 /* Handle clients waiting for this key to be loaded. */
8150 handleClientsBlockedOnSwappedKey(db,key);
8151 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8152 /* Now we know the amount of pages required to swap this object.
8153 * Let's find some space for it, and queue this task again
8154 * rebranded as REDIS_IOJOB_DO_SWAP. */
8155 if (!vmCanSwapOut() ||
8156 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8157 {
8158 /* Ooops... no space or we can't swap as there is
8159 * a fork()ed Redis trying to save stuff on disk. */
8160 freeIOJob(j);
8161 key->storage = REDIS_VM_MEMORY; /* undo operation */
8162 } else {
8163 /* Note that we need to mark this pages as used now,
8164 * if the job will be canceled, we'll mark them as freed
8165 * again. */
8166 vmMarkPagesUsed(j->page,j->pages);
8167 j->type = REDIS_IOJOB_DO_SWAP;
8168 lockThreadedIO();
8169 queueIOJob(j);
8170 unlockThreadedIO();
8171 }
8172 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8173 robj *val;
8174
8175 /* Key swapped. We can finally free some memory. */
8176 if (key->storage != REDIS_VM_SWAPPING) {
8177 printf("key->storage: %d\n",key->storage);
8178 printf("key->name: %s\n",(char*)key->ptr);
8179 printf("key->refcount: %d\n",key->refcount);
8180 printf("val: %p\n",(void*)j->val);
8181 printf("val->type: %d\n",j->val->type);
8182 printf("val->ptr: %s\n",(char*)j->val->ptr);
8183 }
8184 redisAssert(key->storage == REDIS_VM_SWAPPING);
8185 val = dictGetEntryVal(de);
8186 key->vm.page = j->page;
8187 key->vm.usedpages = j->pages;
8188 key->storage = REDIS_VM_SWAPPED;
8189 key->vtype = j->val->type;
8190 decrRefCount(val); /* Deallocate the object from memory. */
8191 dictGetEntryVal(de) = NULL;
8192 redisLog(REDIS_DEBUG,
8193 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8194 (unsigned char*) key->ptr,
8195 (unsigned long long) j->page, (unsigned long long) j->pages);
8196 server.vm_stats_swapped_objects++;
8197 server.vm_stats_swapouts++;
8198 freeIOJob(j);
8199 /* Put a few more swap requests in queue if we are still
8200 * out of memory */
8201 if (trytoswap && vmCanSwapOut() &&
8202 zmalloc_used_memory() > server.vm_max_memory)
8203 {
8204 int more = 1;
8205 while(more) {
8206 lockThreadedIO();
8207 more = listLength(server.io_newjobs) <
8208 (unsigned) server.vm_max_threads;
8209 unlockThreadedIO();
8210 /* Don't waste CPU time if swappable objects are rare. */
8211 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8212 trytoswap = 0;
8213 break;
8214 }
8215 }
8216 }
8217 }
8218 processed++;
8219 if (processed == toprocess) return;
8220 }
8221 if (retval < 0 && errno != EAGAIN) {
8222 redisLog(REDIS_WARNING,
8223 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8224 strerror(errno));
8225 }
8226 }
8227
8228 static void lockThreadedIO(void) {
8229 pthread_mutex_lock(&server.io_mutex);
8230 }
8231
8232 static void unlockThreadedIO(void) {
8233 pthread_mutex_unlock(&server.io_mutex);
8234 }
8235
8236 /* Remove the specified object from the threaded I/O queue if still not
8237 * processed, otherwise make sure to flag it as canceled. */
8238 static void vmCancelThreadedIOJob(robj *o) {
8239 list *lists[3] = {
8240 server.io_newjobs, /* 0 */
8241 server.io_processing, /* 1 */
8242 server.io_processed /* 2 */
8243 };
8244 int i;
8245
8246 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8247 again:
8248 lockThreadedIO();
8249 /* Search for a matching key in one of the queues */
8250 for (i = 0; i < 3; i++) {
8251 listNode *ln;
8252 listIter li;
8253
8254 listRewind(lists[i],&li);
8255 while ((ln = listNext(&li)) != NULL) {
8256 iojob *job = ln->value;
8257
8258 if (job->canceled) continue; /* Skip this, already canceled. */
8259 if (compareStringObjects(job->key,o) == 0) {
8260 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8261 (void*)job, (char*)o->ptr, job->type, i);
8262 /* Mark the pages as free since the swap didn't happened
8263 * or happened but is now discarded. */
8264 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8265 vmMarkPagesFree(job->page,job->pages);
8266 /* Cancel the job. It depends on the list the job is
8267 * living in. */
8268 switch(i) {
8269 case 0: /* io_newjobs */
8270 /* If the job was yet not processed the best thing to do
8271 * is to remove it from the queue at all */
8272 freeIOJob(job);
8273 listDelNode(lists[i],ln);
8274 break;
8275 case 1: /* io_processing */
8276 /* Oh Shi- the thread is messing with the Job:
8277 *
8278 * Probably it's accessing the object if this is a
8279 * PREPARE_SWAP or DO_SWAP job.
8280 * If it's a LOAD job it may be reading from disk and
8281 * if we don't wait for the job to terminate before to
8282 * cancel it, maybe in a few microseconds data can be
8283 * corrupted in this pages. So the short story is:
8284 *
8285 * Better to wait for the job to move into the
8286 * next queue (processed)... */
8287
8288 /* We try again and again until the job is completed. */
8289 unlockThreadedIO();
8290 /* But let's wait some time for the I/O thread
8291 * to finish with this job. After all this condition
8292 * should be very rare. */
8293 usleep(1);
8294 goto again;
8295 case 2: /* io_processed */
8296 /* The job was already processed, that's easy...
8297 * just mark it as canceled so that we'll ignore it
8298 * when processing completed jobs. */
8299 job->canceled = 1;
8300 break;
8301 }
8302 /* Finally we have to adjust the storage type of the object
8303 * in order to "UNDO" the operaiton. */
8304 if (o->storage == REDIS_VM_LOADING)
8305 o->storage = REDIS_VM_SWAPPED;
8306 else if (o->storage == REDIS_VM_SWAPPING)
8307 o->storage = REDIS_VM_MEMORY;
8308 unlockThreadedIO();
8309 return;
8310 }
8311 }
8312 }
8313 unlockThreadedIO();
8314 assert(1 != 1); /* We should never reach this */
8315 }
8316
8317 static void *IOThreadEntryPoint(void *arg) {
8318 iojob *j;
8319 listNode *ln;
8320 REDIS_NOTUSED(arg);
8321
8322 pthread_detach(pthread_self());
8323 while(1) {
8324 /* Get a new job to process */
8325 lockThreadedIO();
8326 if (listLength(server.io_newjobs) == 0) {
8327 /* No new jobs in queue, exit. */
8328 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8329 (long) pthread_self());
8330 server.io_active_threads--;
8331 unlockThreadedIO();
8332 return NULL;
8333 }
8334 ln = listFirst(server.io_newjobs);
8335 j = ln->value;
8336 listDelNode(server.io_newjobs,ln);
8337 /* Add the job in the processing queue */
8338 j->thread = pthread_self();
8339 listAddNodeTail(server.io_processing,j);
8340 ln = listLast(server.io_processing); /* We use ln later to remove it */
8341 unlockThreadedIO();
8342 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8343 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8344
8345 /* Process the Job */
8346 if (j->type == REDIS_IOJOB_LOAD) {
8347 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8348 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8349 FILE *fp = fopen("/dev/null","w+");
8350 j->pages = rdbSavedObjectPages(j->val,fp);
8351 fclose(fp);
8352 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8353 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8354 j->canceled = 1;
8355 }
8356
8357 /* Done: insert the job into the processed queue */
8358 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8359 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8360 lockThreadedIO();
8361 listDelNode(server.io_processing,ln);
8362 listAddNodeTail(server.io_processed,j);
8363 unlockThreadedIO();
8364
8365 /* Signal the main thread there is new stuff to process */
8366 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8367 }
8368 return NULL; /* never reached */
8369 }
8370
8371 static void spawnIOThread(void) {
8372 pthread_t thread;
8373 sigset_t mask, omask;
8374
8375 sigemptyset(&mask);
8376 sigaddset(&mask,SIGCHLD);
8377 sigaddset(&mask,SIGHUP);
8378 sigaddset(&mask,SIGPIPE);
8379 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8380 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8381 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8382 server.io_active_threads++;
8383 }
8384
8385 /* We need to wait for the last thread to exit before we are able to
8386 * fork() in order to BGSAVE or BGREWRITEAOF. */
8387 static void waitEmptyIOJobsQueue(void) {
8388 while(1) {
8389 int io_processed_len;
8390
8391 lockThreadedIO();
8392 if (listLength(server.io_newjobs) == 0 &&
8393 listLength(server.io_processing) == 0 &&
8394 server.io_active_threads == 0)
8395 {
8396 unlockThreadedIO();
8397 return;
8398 }
8399 /* While waiting for empty jobs queue condition we post-process some
8400 * finshed job, as I/O threads may be hanging trying to write against
8401 * the io_ready_pipe_write FD but there are so much pending jobs that
8402 * it's blocking. */
8403 io_processed_len = listLength(server.io_processed);
8404 unlockThreadedIO();
8405 if (io_processed_len) {
8406 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8407 usleep(1000); /* 1 millisecond */
8408 } else {
8409 usleep(10000); /* 10 milliseconds */
8410 }
8411 }
8412 }
8413
8414 static void vmReopenSwapFile(void) {
8415 /* Note: we don't close the old one as we are in the child process
8416 * and don't want to mess at all with the original file object. */
8417 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8418 if (server.vm_fp == NULL) {
8419 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8420 server.vm_swap_file);
8421 _exit(1);
8422 }
8423 server.vm_fd = fileno(server.vm_fp);
8424 }
8425
8426 /* This function must be called while with threaded IO locked */
8427 static void queueIOJob(iojob *j) {
8428 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8429 (void*)j, j->type, (char*)j->key->ptr);
8430 listAddNodeTail(server.io_newjobs,j);
8431 if (server.io_active_threads < server.vm_max_threads)
8432 spawnIOThread();
8433 }
8434
8435 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8436 iojob *j;
8437
8438 assert(key->storage == REDIS_VM_MEMORY);
8439 assert(key->refcount == 1);
8440
8441 j = zmalloc(sizeof(*j));
8442 j->type = REDIS_IOJOB_PREPARE_SWAP;
8443 j->db = db;
8444 j->key = dupStringObject(key);
8445 j->val = val;
8446 incrRefCount(val);
8447 j->canceled = 0;
8448 j->thread = (pthread_t) -1;
8449 key->storage = REDIS_VM_SWAPPING;
8450
8451 lockThreadedIO();
8452 queueIOJob(j);
8453 unlockThreadedIO();
8454 return REDIS_OK;
8455 }
8456
8457 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8458
8459 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8460 * If there is not already a job loading the key, it is craeted.
8461 * The key is added to the io_keys list in the client structure, and also
8462 * in the hash table mapping swapped keys to waiting clients, that is,
8463 * server.io_waited_keys. */
8464 static int waitForSwappedKey(redisClient *c, robj *key) {
8465 struct dictEntry *de;
8466 robj *o;
8467 list *l;
8468
8469 /* If the key does not exist or is already in RAM we don't need to
8470 * block the client at all. */
8471 de = dictFind(c->db->dict,key);
8472 if (de == NULL) return 0;
8473 o = dictGetEntryKey(de);
8474 if (o->storage == REDIS_VM_MEMORY) {
8475 return 0;
8476 } else if (o->storage == REDIS_VM_SWAPPING) {
8477 /* We were swapping the key, undo it! */
8478 vmCancelThreadedIOJob(o);
8479 return 0;
8480 }
8481
8482 /* OK: the key is either swapped, or being loaded just now. */
8483
8484 /* Add the key to the list of keys this client is waiting for.
8485 * This maps clients to keys they are waiting for. */
8486 listAddNodeTail(c->io_keys,key);
8487 incrRefCount(key);
8488
8489 /* Add the client to the swapped keys => clients waiting map. */
8490 de = dictFind(c->db->io_keys,key);
8491 if (de == NULL) {
8492 int retval;
8493
8494 /* For every key we take a list of clients blocked for it */
8495 l = listCreate();
8496 retval = dictAdd(c->db->io_keys,key,l);
8497 incrRefCount(key);
8498 assert(retval == DICT_OK);
8499 } else {
8500 l = dictGetEntryVal(de);
8501 }
8502 listAddNodeTail(l,c);
8503
8504 /* Are we already loading the key from disk? If not create a job */
8505 if (o->storage == REDIS_VM_SWAPPED) {
8506 iojob *j;
8507
8508 o->storage = REDIS_VM_LOADING;
8509 j = zmalloc(sizeof(*j));
8510 j->type = REDIS_IOJOB_LOAD;
8511 j->db = c->db;
8512 j->key = dupStringObject(key);
8513 j->key->vtype = o->vtype;
8514 j->page = o->vm.page;
8515 j->val = NULL;
8516 j->canceled = 0;
8517 j->thread = (pthread_t) -1;
8518 lockThreadedIO();
8519 queueIOJob(j);
8520 unlockThreadedIO();
8521 }
8522 return 1;
8523 }
8524
8525 /* Is this client attempting to run a command against swapped keys?
8526 * If so, block it ASAP, load the keys in background, then resume it.
8527 *
8528 * The important idea about this function is that it can fail! If keys will
8529 * still be swapped when the client is resumed, this key lookups will
8530 * just block loading keys from disk. In practical terms this should only
8531 * happen with SORT BY command or if there is a bug in this function.
8532 *
8533 * Return 1 if the client is marked as blocked, 0 if the client can
8534 * continue as the keys it is going to access appear to be in memory. */
8535 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8536 int j, last;
8537
8538 if (cmd->vm_firstkey == 0) return 0;
8539 last = cmd->vm_lastkey;
8540 if (last < 0) last = c->argc+last;
8541 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8542 waitForSwappedKey(c,c->argv[j]);
8543 /* If the client was blocked for at least one key, mark it as blocked. */
8544 if (listLength(c->io_keys)) {
8545 c->flags |= REDIS_IO_WAIT;
8546 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8547 server.vm_blocked_clients++;
8548 return 1;
8549 } else {
8550 return 0;
8551 }
8552 }
8553
8554 /* Remove the 'key' from the list of blocked keys for a given client.
8555 *
8556 * The function returns 1 when there are no longer blocking keys after
8557 * the current one was removed (and the client can be unblocked). */
8558 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8559 list *l;
8560 listNode *ln;
8561 listIter li;
8562 struct dictEntry *de;
8563
8564 /* Remove the key from the list of keys this client is waiting for. */
8565 listRewind(c->io_keys,&li);
8566 while ((ln = listNext(&li)) != NULL) {
8567 if (compareStringObjects(ln->value,key) == 0) {
8568 listDelNode(c->io_keys,ln);
8569 break;
8570 }
8571 }
8572 assert(ln != NULL);
8573
8574 /* Remove the client form the key => waiting clients map. */
8575 de = dictFind(c->db->io_keys,key);
8576 assert(de != NULL);
8577 l = dictGetEntryVal(de);
8578 ln = listSearchKey(l,c);
8579 assert(ln != NULL);
8580 listDelNode(l,ln);
8581 if (listLength(l) == 0)
8582 dictDelete(c->db->io_keys,key);
8583
8584 return listLength(c->io_keys) == 0;
8585 }
8586
8587 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8588 struct dictEntry *de;
8589 list *l;
8590 listNode *ln;
8591 int len;
8592
8593 de = dictFind(db->io_keys,key);
8594 if (!de) return;
8595
8596 l = dictGetEntryVal(de);
8597 len = listLength(l);
8598 /* Note: we can't use something like while(listLength(l)) as the list
8599 * can be freed by the calling function when we remove the last element. */
8600 while (len--) {
8601 ln = listFirst(l);
8602 redisClient *c = ln->value;
8603
8604 if (dontWaitForSwappedKey(c,key)) {
8605 /* Put the client in the list of clients ready to go as we
8606 * loaded all the keys about it. */
8607 listAddNodeTail(server.io_ready_clients,c);
8608 }
8609 }
8610 }
8611
8612 /* ================================= Debugging ============================== */
8613
8614 static void debugCommand(redisClient *c) {
8615 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8616 *((char*)-1) = 'x';
8617 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8618 if (rdbSave(server.dbfilename) != REDIS_OK) {
8619 addReply(c,shared.err);
8620 return;
8621 }
8622 emptyDb();
8623 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8624 addReply(c,shared.err);
8625 return;
8626 }
8627 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8628 addReply(c,shared.ok);
8629 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8630 emptyDb();
8631 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8632 addReply(c,shared.err);
8633 return;
8634 }
8635 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8636 addReply(c,shared.ok);
8637 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8638 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8639 robj *key, *val;
8640
8641 if (!de) {
8642 addReply(c,shared.nokeyerr);
8643 return;
8644 }
8645 key = dictGetEntryKey(de);
8646 val = dictGetEntryVal(de);
8647 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8648 key->storage == REDIS_VM_SWAPPING)) {
8649 addReplySds(c,sdscatprintf(sdsempty(),
8650 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8651 "encoding:%d serializedlength:%lld\r\n",
8652 (void*)key, key->refcount, (void*)val, val->refcount,
8653 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8654 } else {
8655 addReplySds(c,sdscatprintf(sdsempty(),
8656 "+Key at:%p refcount:%d, value swapped at: page %llu "
8657 "using %llu pages\r\n",
8658 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8659 (unsigned long long) key->vm.usedpages));
8660 }
8661 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8662 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8663 robj *key, *val;
8664
8665 if (!server.vm_enabled) {
8666 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8667 return;
8668 }
8669 if (!de) {
8670 addReply(c,shared.nokeyerr);
8671 return;
8672 }
8673 key = dictGetEntryKey(de);
8674 val = dictGetEntryVal(de);
8675 /* If the key is shared we want to create a copy */
8676 if (key->refcount > 1) {
8677 robj *newkey = dupStringObject(key);
8678 decrRefCount(key);
8679 key = dictGetEntryKey(de) = newkey;
8680 }
8681 /* Swap it */
8682 if (key->storage != REDIS_VM_MEMORY) {
8683 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8684 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8685 dictGetEntryVal(de) = NULL;
8686 addReply(c,shared.ok);
8687 } else {
8688 addReply(c,shared.err);
8689 }
8690 } else {
8691 addReplySds(c,sdsnew(
8692 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8693 }
8694 }
8695
8696 static void _redisAssert(char *estr, char *file, int line) {
8697 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8698 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8699 #ifdef HAVE_BACKTRACE
8700 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8701 *((char*)-1) = 'x';
8702 #endif
8703 }
8704
8705 /* =================================== Main! ================================ */
8706
8707 #ifdef __linux__
/* Read the number stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed integer on success, or -1 if the file can't be opened,
 * can't be read, or does not start with a number. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64], *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* strtol() instead of atoi(): atoi() returns 0 for unparsable input,
     * which the caller could not tell apart from a real value of 0. */
    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were found */
    return (int)value;
}
8721
8722 void linuxOvercommitMemoryWarning(void) {
8723 if (linuxOvercommitMemoryValue() == 0) {
8724 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8725 }
8726 }
8727 #endif /* __linux__ */
8728
8729 static void daemonize(void) {
8730 int fd;
8731 FILE *fp;
8732
8733 if (fork() != 0) exit(0); /* parent exits */
8734 setsid(); /* create a new session */
8735
8736 /* Every output goes to /dev/null. If Redis is daemonized but
8737 * the 'logfile' is set to 'stdout' in the configuration file
8738 * it will not log at all. */
8739 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8740 dup2(fd, STDIN_FILENO);
8741 dup2(fd, STDOUT_FILENO);
8742 dup2(fd, STDERR_FILENO);
8743 if (fd > STDERR_FILENO) close(fd);
8744 }
8745 /* Try to write the pid file */
8746 fp = fopen(server.pidfile,"w");
8747 if (fp) {
8748 fprintf(fp,"%d\n",getpid());
8749 fclose(fp);
8750 }
8751 }
8752
8753 int main(int argc, char **argv) {
8754 time_t start;
8755
8756 initServerConfig();
8757 if (argc == 2) {
8758 resetServerSaveParams();
8759 loadServerConfig(argv[1]);
8760 } else if (argc > 2) {
8761 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8762 exit(1);
8763 } else {
8764 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8765 }
8766 if (server.daemonize) daemonize();
8767 initServer();
8768 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8769 #ifdef __linux__
8770 linuxOvercommitMemoryWarning();
8771 #endif
8772 start = time(NULL);
8773 if (server.appendonly) {
8774 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8775 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8776 } else {
8777 if (rdbLoad(server.dbfilename) == REDIS_OK)
8778 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8779 }
8780 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8781 aeSetBeforeSleepProc(server.el,beforeSleep);
8782 aeMain(server.el);
8783 aeDeleteEventLoop(server.el);
8784 return 0;
8785 }
8786
8787 /* ============================= Backtrace support ========================= */
8788
8789 #ifdef HAVE_BACKTRACE
8790 static char *findFuncName(void *pointer, unsigned long *offset);
8791
/* Return the instruction pointer saved in the signal context 'uc', or NULL
 * when the platform is not one of those handled below.
 *
 * The field holding the instruction pointer has a different name on every
 * platform, hence the preprocessor ladder. */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__)
    return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32 bit */
#elif defined(__X86_64__) || defined(__x86_64__)
    /* The 64 bit register set has no REG_EIP entry: the instruction
     * pointer is stored in the REG_RIP slot. */
    return (void*) uc->uc_mcontext.gregs[REG_RIP]; /* Linux 64 bit */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
8817
8818 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8819 void *trace[100];
8820 char **messages = NULL;
8821 int i, trace_size = 0;
8822 unsigned long offset=0;
8823 ucontext_t *uc = (ucontext_t*) secret;
8824 sds infostring;
8825 REDIS_NOTUSED(info);
8826
8827 redisLog(REDIS_WARNING,
8828 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8829 infostring = genRedisInfoString();
8830 redisLog(REDIS_WARNING, "%s",infostring);
8831 /* It's not safe to sdsfree() the returned string under memory
8832 * corruption conditions. Let it leak as we are going to abort */
8833
8834 trace_size = backtrace(trace, 100);
8835 /* overwrite sigaction with caller's address */
8836 if (getMcontextEip(uc) != NULL) {
8837 trace[1] = getMcontextEip(uc);
8838 }
8839 messages = backtrace_symbols(trace, trace_size);
8840
8841 for (i=1; i<trace_size; ++i) {
8842 char *fn = findFuncName(trace[i], &offset), *p;
8843
8844 p = strchr(messages[i],'+');
8845 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8846 redisLog(REDIS_WARNING,"%s", messages[i]);
8847 } else {
8848 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8849 }
8850 }
8851 /* free(messages); Don't call free() with possibly corrupted memory. */
8852 _exit(0);
8853 }
8854
8855 static void setupSigSegvAction(void) {
8856 struct sigaction act;
8857
8858 sigemptyset (&act.sa_mask);
8859 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8860 * is used. Otherwise, sa_handler is used */
8861 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8862 act.sa_sigaction = segvHandler;
8863 sigaction (SIGSEGV, &act, NULL);
8864 sigaction (SIGBUS, &act, NULL);
8865 sigaction (SIGFPE, &act, NULL);
8866 sigaction (SIGILL, &act, NULL);
8867 sigaction (SIGBUS, &act, NULL);
8868 return;
8869 }
8870
8871 #include "staticsymbols.h"
8872 /* This function try to convert a pointer into a function name. It's used in
8873 * oreder to provide a backtrace under segmentation fault that's able to
8874 * display functions declared as static (otherwise the backtrace is useless). */
8875 static char *findFuncName(void *pointer, unsigned long *offset){
8876 int i, ret = -1;
8877 unsigned long off, minoff = 0;
8878
8879 /* Try to match against the Symbol with the smallest offset */
8880 for (i=0; symsTable[i].pointer; i++) {
8881 unsigned long lp = (unsigned long) pointer;
8882
8883 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8884 off=lp-symsTable[i].pointer;
8885 if (ret < 0 || off < minoff) {
8886 minoff=off;
8887 ret=i;
8888 }
8889 }
8890 }
8891 if (ret == -1) return NULL;
8892 *offset = minoff;
8893 return symsTable[ret].name;
8894 }
8895 #else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE there is no handler to install: this is a no-op
 * that only exists so that callers need no conditional compilation. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
8898 #endif /* HAVE_BACKTRACE */
8899
8900
8901
8902 /* The End */
8903
8904
8905