]> git.saurik.com Git - redis.git/blob - redis.c
replaced ZMERGE by ZUNION and ZINTER. note: key preloading by the VM does not yet...
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Error codes returned by functions that report success/failure.
 * REDIS_ERR is parenthesized so the negative literal cannot merge with a
 * neighbouring operator when the macro is expanded inside an expression. */
#define REDIS_OK 0
#define REDIS_ERR (-1)
83
/* Static server configuration: compile-time defaults and limits */
#define REDIS_SERVERPORT              6379             /* TCP port */
#define REDIS_MAXIDLETIME             (60*5)           /* default client timeout */
#define REDIS_IOBUF_LEN               1024
#define REDIS_LOADBUF_LEN             1024
#define REDIS_STATIC_ARGS             4
#define REDIS_DEFAULT_DBNUM           16
#define REDIS_CONFIGLINE_MAX          1024
#define REDIS_OBJFREELIST_MAX         1000000          /* max number of objects to cache */
#define REDIS_MAX_SYNC_TIME           60               /* a slave can't take longer than this to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON  100              /* try to expire 100 keys/second */
#define REDIS_MAX_WRITE_PER_EVENT     (1024*64)
#define REDIS_REQUEST_MAX_SIZE        (1024*1024*256)  /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
#define REDIS_WRITEV_THRESHOLD        3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT      256
102
/* Hash table parameters */
#define REDIS_HT_MINFILL    10  /* minimal hash table fill: 10% */

/* Command flags */
#define REDIS_CMD_BULK      1   /* bulk write command */
#define REDIS_CMD_INLINE    2   /* inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied in low memory conditions. */
#define REDIS_CMD_DENYOOM   4
114
/* Object types */
#define REDIS_STRING            0
#define REDIS_LIST              1
#define REDIS_SET               2
#define REDIS_ZSET              3
#define REDIS_HASH              4

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * represented internally in more than one way. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW      0   /* raw representation */
#define REDIS_ENCODING_INT      1   /* encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* encoded as a hash table */

/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255
134
/* Defines related to the dump file format. Storing a 32 bit length for
 * every short key would take a lot of space, so the two most significant
 * bits of the first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the 6 low bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits + 8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit len will follow
 * 11|000000 => a specially encoded object will follow. The six bits
 *              number specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN   0
#define REDIS_RDB_14BITLEN  1
#define REDIS_RDB_32BITLEN  2
#define REDIS_RDB_ENCVAL    3
#define REDIS_RDB_LENERR    UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8  0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF   3   /* string compressed with FASTLZ */
161
/* Virtual memory object->storage field. */
#define REDIS_VM_MEMORY     0   /* the object is in memory */
#define REDIS_VM_SWAPPED    1   /* the object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. While Virtual Memory I/O operations are performed
 * by threads, these operations must be processed by the main thread once
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
179
/* Client flags */
#define REDIS_SLAVE     1   /* this client is a slave server */
#define REDIS_MASTER    2   /* this client is a master server */
#define REDIS_MONITOR   4   /* this client is a slave monitor, see MONITOR */
#define REDIS_MULTI     8   /* this client is in a MULTI context */
#define REDIS_BLOCKED   16  /* the client is waiting in a blocking operation */
#define REDIS_IO_WAIT   32  /* the client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* no active replication */
#define REDIS_REPL_CONNECT      1   /* must connect to master */
#define REDIS_REPL_CONNECTED    2   /* connected to master */

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state instead the server
 * is waiting to start the next background saving in order to send updates
 * to it. */
#define REDIS_REPL_WAIT_BGSAVE_START    3   /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END      4   /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK            5   /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE               6   /* bulk DB already transmitted, receive updates */
201
/* List related stuff */
#define REDIS_HEAD  0
#define REDIS_TAIL  1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* Log levels */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3

/* Silences "unused variable/parameter" compiler warnings */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL  32      /* should be enough for 2^32 elements */
#define ZSKIPLIST_P         0.25    /* skiplist P = 1/4 */

/* Append only file fsync policies */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
232
/* Since we can print the stack trace, we use our own assert: when the
 * expression is false, _redisAssert() is called with the expression text,
 * the source file and the line, and then the process exits via _exit(1). */
static void _redisAssert(char *estr, char *file, int line);
#define redisAssert(_e) \
    ((_e) ? (void)0 : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
236
237 /*================================= Data types ============================== */
238
239 /* A redis object, that is a type able to hold a string / list / set */
240
/* The VM object structure: the swap-related fields embedded in a redisObject.
 * Only the type is declared here; no object of this type is defined at
 * file scope, so the declaration does not create a global variable. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* last access time */
};
247
248 /* The actual Redis Object */
249 typedef struct redisObject {
250 void *ptr;
251 unsigned char type;
252 unsigned char encoding;
253 unsigned char storage; /* If this object is a key, where is the value?
254 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
255 unsigned char vtype; /* If this object is a key, and value is swapped out,
256 * this is the type of the swapped out object. */
257 int refcount;
258 /* VM fields, this are only allocated if VM is active, otherwise the
259 * object allocation function will just allocate
260 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
261 * Redis without VM active will not have any overhead. */
262 struct redisObjectVM vm;
263 } robj;
264
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj lvalue to set up, _ptr the value stored in its 'ptr'
 * field. The object becomes a raw-encoded string with refcount 1; the
 * 'storage' field is only written when VM is enabled.
 *
 * The arguments are parenthesized so that expressions can be passed safely,
 * and the expansion has no trailing semicolon: the caller's own ';'
 * terminates the statement, which keeps the macro usable as the body of an
 * unbraced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
276
277 typedef struct redisDb {
278 dict *dict; /* The keyspace for this DB */
279 dict *expires; /* Timeout of keys with a timeout set */
280 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
281 dict *io_keys; /* Keys with clients waiting for VM I/O */
282 int id;
283 } redisDb;
284
285 /* Client MULTI/EXEC state */
286 typedef struct multiCmd {
287 robj **argv;
288 int argc;
289 struct redisCommand *cmd;
290 } multiCmd;
291
292 typedef struct multiState {
293 multiCmd *commands; /* Array of MULTI commands */
294 int count; /* Total number of MULTI commands */
295 } multiState;
296
297 /* With multiplexing we need to take per-clinet state.
298 * Clients are taken in a liked list. */
299 typedef struct redisClient {
300 int fd;
301 redisDb *db;
302 int dictid;
303 sds querybuf;
304 robj **argv, **mbargv;
305 int argc, mbargc;
306 int bulklen; /* bulk read len. -1 if not in bulk read mode */
307 int multibulk; /* multi bulk command format active */
308 list *reply;
309 int sentlen;
310 time_t lastinteraction; /* time of the last interaction, used for timeout */
311 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
312 int slaveseldb; /* slave selected db, if this client is a slave */
313 int authenticated; /* when requirepass is non-NULL */
314 int replstate; /* replication state if this is a slave */
315 int repldbfd; /* replication DB file descriptor */
316 long repldboff; /* replication DB file offset */
317 off_t repldbsize; /* replication DB file size */
318 multiState mstate; /* MULTI/EXEC state */
319 robj **blockingkeys; /* The key we are waiting to terminate a blocking
320 * operation such as BLPOP. Otherwise NULL. */
321 int blockingkeysnum; /* Number of blocking keys */
322 time_t blockingto; /* Blocking operation timeout. If UNIX current time
323 * is >= blockingto then the operation timed out. */
324 list *io_keys; /* Keys this client is waiting to be loaded from the
325 * swap file in order to continue. */
326 } redisClient;
327
/* One save point: a (seconds, changes) pair, stored in the saveparams
 * array of the server structure. */
struct saveparam {
    time_t seconds;
    int changes;
};
332
333 /* Global server state structure */
334 struct redisServer {
335 int port;
336 int fd;
337 redisDb *db;
338 dict *sharingpool; /* Poll used for object sharing */
339 unsigned int sharingpoolsize;
340 long long dirty; /* changes to DB from the last save */
341 list *clients;
342 list *slaves, *monitors;
343 char neterr[ANET_ERR_LEN];
344 aeEventLoop *el;
345 int cronloops; /* number of times the cron function run */
346 list *objfreelist; /* A list of freed objects to avoid malloc() */
347 time_t lastsave; /* Unix time of last save succeeede */
348 /* Fields used only for stats */
349 time_t stat_starttime; /* server start time */
350 long long stat_numcommands; /* number of processed commands */
351 long long stat_numconnections; /* number of connections received */
352 /* Configuration */
353 int verbosity;
354 int glueoutputbuf;
355 int maxidletime;
356 int dbnum;
357 int daemonize;
358 int appendonly;
359 int appendfsync;
360 time_t lastfsync;
361 int appendfd;
362 int appendseldb;
363 char *pidfile;
364 pid_t bgsavechildpid;
365 pid_t bgrewritechildpid;
366 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
367 struct saveparam *saveparams;
368 int saveparamslen;
369 char *logfile;
370 char *bindaddr;
371 char *dbfilename;
372 char *appendfilename;
373 char *requirepass;
374 int shareobjects;
375 int rdbcompression;
376 /* Replication related */
377 int isslave;
378 char *masterauth;
379 char *masterhost;
380 int masterport;
381 redisClient *master; /* client that is master for this slave */
382 int replstate;
383 unsigned int maxclients;
384 unsigned long long maxmemory;
385 unsigned int blpop_blocked_clients;
386 unsigned int vm_blocked_clients;
387 /* Sort parameters - qsort_r() is only available under BSD so we
388 * have to take this state global, in order to pass it to sortCompare() */
389 int sort_desc;
390 int sort_alpha;
391 int sort_bypattern;
392 /* Virtual memory configuration */
393 int vm_enabled;
394 char *vm_swap_file;
395 off_t vm_page_size;
396 off_t vm_pages;
397 unsigned long long vm_max_memory;
398 /* Hashes config */
399 size_t hash_max_zipmap_entries;
400 size_t hash_max_zipmap_value;
401 /* Virtual memory state */
402 FILE *vm_fp;
403 int vm_fd;
404 off_t vm_next_page; /* Next probably empty page */
405 off_t vm_near_pages; /* Number of pages allocated sequentially */
406 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
407 time_t unixtime; /* Unix time sampled every second. */
408 /* Virtual memory I/O threads stuff */
409 /* An I/O thread process an element taken from the io_jobs queue and
410 * put the result of the operation in the io_done list. While the
411 * job is being processed, it's put on io_processing queue. */
412 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
413 list *io_processing; /* List of VM I/O jobs being processed */
414 list *io_processed; /* List of VM I/O jobs already processed */
415 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
416 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
417 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
418 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
419 pthread_attr_t io_threads_attr; /* attributes for threads creation */
420 int io_active_threads; /* Number of running I/O threads */
421 int vm_max_threads; /* Max number of I/O threads running at the same time */
422 /* Our main thread is blocked on the event loop, locking for sockets ready
423 * to be read or written, so when a threaded I/O operation is ready to be
424 * processed by the main thread, the I/O thread will use a unix pipe to
425 * awake the main thread. The followings are the two pipe FDs. */
426 int io_ready_pipe_read;
427 int io_ready_pipe_write;
428 /* Virtual memory stats */
429 unsigned long long vm_stats_used_pages;
430 unsigned long long vm_stats_swapped_objects;
431 unsigned long long vm_stats_swapouts;
432 unsigned long long vm_stats_swapins;
433 FILE *devnull;
434 };
435
436 typedef void redisCommandProc(redisClient *c);
437 struct redisCommand {
438 char *name;
439 redisCommandProc *proc;
440 int arity;
441 int flags;
442 /* What keys should be loaded in background when calling this command? */
443 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
444 int vm_lastkey; /* THe last argument that's a key */
445 int vm_keystep; /* The step between first and last key */
446 };
447
/* Pairs a symbol name with a numeric pointer value */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
452
453 typedef struct _redisSortObject {
454 robj *obj;
455 union {
456 double score;
457 robj *cmpobj;
458 } u;
459 } redisSortObject;
460
461 typedef struct _redisSortOperation {
462 int type;
463 robj *pattern;
464 } redisSortOperation;
465
466 /* ZSETs use a specialized version of Skiplists */
467
468 typedef struct zskiplistNode {
469 struct zskiplistNode **forward;
470 struct zskiplistNode *backward;
471 unsigned int *span;
472 double score;
473 robj *obj;
474 } zskiplistNode;
475
/* The skiplist itself: head and tail nodes, length and level */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
481
482 typedef struct zset {
483 dict *dict;
484 zskiplist *zsl;
485 } zset;
486
487 /* Our shared "common" objects */
488
489 struct sharedObjectsStruct {
490 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
491 *colon, *nullbulk, *nullmultibulk, *queued,
492 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
493 *outofrangeerr, *plus,
494 *select0, *select1, *select2, *select3, *select4,
495 *select5, *select6, *select7, *select8, *select9;
496 } shared;
497
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
503
504 /* VM threaded I/O request message */
505 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
506 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
507 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
508 typedef struct iojob {
509 int type; /* Request type, REDIS_IOJOB_* */
510 redisDb *db;/* Redis database */
511 robj *key; /* This I/O request is about swapping this key */
512 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
513 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
514 off_t page; /* Swap page where to read/write the object */
515 off_t pages; /* Swap pages needed to safe object. PREPARE_SWAP return val */
516 int canceled; /* True if this command was canceled by blocking side of VM */
517 pthread_t thread; /* ID of the thread processing this entry */
518 } iojob;
519
/*================================ Prototypes =============================== */

/* Forward declarations of file-local (static) helpers, so that they can be
 * referenced before the point where they are defined. */
static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
static int rdbLoad(char *filename);
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
static int rdbSaveBackground(char *filename);
static robj *createStringObject(char *ptr, size_t len);
static robj *dupStringObject(robj *o);
static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
static int syncWithMaster(void);
static robj *tryObjectSharing(robj *o);
static int tryObjectEncoding(robj *o);
static robj *getDecodedObject(robj *o);
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
static int deleteIfSwapped(redisDb *db, robj *key);
static int deleteKey(redisDb *db, robj *key);
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
static void updateSlavesWaitingBgsave(int bgsaveerr);
static void freeMemoryIfNeeded(void);
static int processCommand(redisClient *c);
static void setupSigSegvAction(void);
static void rdbRemoveTempFile(pid_t childpid);
static void aofRemoveTempFile(pid_t childpid);
static size_t stringObjectLen(robj *o);
static void processInputBuffer(redisClient *c);
static zskiplist *zslCreate(void);
static void zslFree(zskiplist *zsl);
static void zslInsert(zskiplist *zsl, double score, robj *obj);
/* Same parameter list as the other handlers taking an aeEventLoop below */
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
static void unblockClientWaitingData(redisClient *c);
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
static void vmInit(void);
static void vmMarkPagesFree(off_t page, off_t count);
static robj *vmLoadObject(robj *key);
static robj *vmPreviewObject(robj *key);
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
static int vmCanSwapOut(void);
static int tryFreeOneObjectFromFreelist(void);
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
static int vmFreePage(off_t page);
static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
593
/* Command implementations. Every function below has the redisCommandProc
 * signature (a single redisClient pointer, no return value) so that it can
 * be stored in the 'proc' field of a struct redisCommand entry. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
/* NOTE(review): lowercase 'c' in "command", unlike every other handler name
 * in this list; the command table entry for "rpoplpush" uses this spelling. */
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void zunionCommand(redisClient *c);
static void zinterCommand(redisClient *c);
680
681 /*================================= Globals ================================= */
682
/* Global vars */
/* File-scope instance of struct redisServer; the initStaticStringObject
 * macro reads server.vm_enabled from it. */
static struct redisServer server; /* server global state */
685 static struct redisCommand cmdTable[] = {
686 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
687 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
688 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
689 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
690 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
691 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
692 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
693 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
694 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
695 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
696 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
697 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
698 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
699 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
700 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
701 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
702 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
703 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
704 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
705 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
706 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
707 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
708 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
709 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
710 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
711 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
712 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
713 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
714 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
715 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
716 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
717 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
718 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
719 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
720 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
721 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
722 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
724 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
725 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
726 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
727 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
728 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
729 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
730 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
731 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
732 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
733 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
734 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
735 {"zrank",zrankCommand,3,REDIS_CMD_INLINE,1,1,1},
736 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
737 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
738 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
739 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
740 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
741 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
742 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
743 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
744 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
745 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
746 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
747 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
748 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
749 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
750 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
751 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
752 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
753 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
754 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
755 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
756 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
757 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
758 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
759 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
760 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
761 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
762 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
763 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
764 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
765 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
766 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
767 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
768 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
769 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
770 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
771 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
772 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
773 {NULL,NULL,0,0,0,0,0}
774 };
775
776 /*============================ Utility functions ============================ */
777
/* Lowercase one byte for case-insensitive matching. Negative values (high
 * bytes when plain char is signed) are returned untouched, because handing
 * them to tolower() is undefined behavior. */
static int stringmatchLower(int ch) {
    return (ch >= 0) ? tolower(ch) : ch;
}

/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Neither buffer needs to be nul terminated: no byte at
 * or past the given lengths is ever inspected.
 *
 * Supported syntax:
 *   *        any sequence of bytes (including the empty one)
 *   ?        exactly one byte
 *   [abc]    one byte out of a set; [^abc] negates, [a-z] is a range,
 *            a backslash inside the set escapes the next byte
 *   \x       the byte x taken literally
 *
 * A class that is not closed by ']' is considered closed by the end of the
 * pattern. If 'nocase' is non zero, literal bytes and ranges are compared
 * case-insensitively.
 *
 * Returns 1 on match, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of '*', looking only at bytes that belong
             * to the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen <= 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match = 0;

            /* A class always consumes one byte of the string. */
            if (stringLen <= 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = (patternLen > 0 && pattern[0] == '^');
            if (negate) {
                pattern++;
                patternLen--;
            }
            while (1) {
                if (patternLen <= 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchLower(start);
                        end = stringmatchLower(end);
                        c = stringmatchLower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchLower(pattern[0]) ==
                            stringmatchLower(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen <= 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchLower(pattern[0]) !=
                    stringmatchLower(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may remain. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
900
901 static void redisLog(int level, const char *fmt, ...) {
902 va_list ap;
903 FILE *fp;
904
905 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
906 if (!fp) return;
907
908 va_start(ap, fmt);
909 if (level >= server.verbosity) {
910 char *c = ".-*#";
911 char buf[64];
912 time_t now;
913
914 now = time(NULL);
915 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
916 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
917 vfprintf(fp, fmt, ap);
918 fprintf(fp,"\n");
919 fflush(fp);
920 }
921 va_end(ap);
922
923 if (server.logfile) fclose(fp);
924 }
925
926 /*====================== Hash table type implementation ==================== */
927
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
931
/* Dict destructor callback that releases 'val' with zfree().
 * The privdata argument is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
937
938 static void dictListDestructor(void *privdata, void *val)
939 {
940 DICT_NOTUSED(privdata);
941 listRelease((list*)val);
942 }
943
944 static int sdsDictKeyCompare(void *privdata, const void *key1,
945 const void *key2)
946 {
947 int l1,l2;
948 DICT_NOTUSED(privdata);
949
950 l1 = sdslen((sds)key1);
951 l2 = sdslen((sds)key2);
952 if (l1 != l2) return 0;
953 return memcmp(key1, key2, l1) == 0;
954 }
955
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). The privdata argument is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL)
        decrRefCount(val);
}
963
964 static int dictObjKeyCompare(void *privdata, const void *key1,
965 const void *key2)
966 {
967 const robj *o1 = key1, *o2 = key2;
968 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
969 }
970
971 static unsigned int dictObjHash(const void *key) {
972 const robj *o = key;
973 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
974 }
975
976 static int dictEncObjKeyCompare(void *privdata, const void *key1,
977 const void *key2)
978 {
979 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
980 int cmp;
981
982 o1 = getDecodedObject(o1);
983 o2 = getDecodedObject(o2);
984 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
985 decrRefCount(o1);
986 decrRefCount(o2);
987 return cmp;
988 }
989
990 static unsigned int dictEncObjHash(const void *key) {
991 robj *o = (robj*) key;
992
993 if (o->encoding == REDIS_ENCODING_RAW) {
994 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
995 } else {
996 if (o->encoding == REDIS_ENCODING_INT) {
997 char buf[32];
998 int len;
999
1000 len = snprintf(buf,32,"%ld",(long)o->ptr);
1001 return dictGenHashFunction((unsigned char*)buf, len);
1002 } else {
1003 unsigned int hash;
1004
1005 o = getDecodedObject(o);
1006 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1007 decrRefCount(o);
1008 return hash;
1009 }
1010 }
1011 }
1012
1013 /* Sets type and expires */
1014 static dictType setDictType = {
1015 dictEncObjHash, /* hash function */
1016 NULL, /* key dup */
1017 NULL, /* val dup */
1018 dictEncObjKeyCompare, /* key compare */
1019 dictRedisObjectDestructor, /* key destructor */
1020 NULL /* val destructor */
1021 };
1022
1023 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1024 static dictType zsetDictType = {
1025 dictEncObjHash, /* hash function */
1026 NULL, /* key dup */
1027 NULL, /* val dup */
1028 dictEncObjKeyCompare, /* key compare */
1029 dictRedisObjectDestructor, /* key destructor */
1030 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1031 };
1032
1033 /* Db->dict */
1034 static dictType dbDictType = {
1035 dictObjHash, /* hash function */
1036 NULL, /* key dup */
1037 NULL, /* val dup */
1038 dictObjKeyCompare, /* key compare */
1039 dictRedisObjectDestructor, /* key destructor */
1040 dictRedisObjectDestructor /* val destructor */
1041 };
1042
1043 /* Db->expires */
1044 static dictType keyptrDictType = {
1045 dictObjHash, /* hash function */
1046 NULL, /* key dup */
1047 NULL, /* val dup */
1048 dictObjKeyCompare, /* key compare */
1049 dictRedisObjectDestructor, /* key destructor */
1050 NULL /* val destructor */
1051 };
1052
1053 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1054 static dictType hashDictType = {
1055 dictEncObjHash, /* hash function */
1056 NULL, /* key dup */
1057 NULL, /* val dup */
1058 dictEncObjKeyCompare, /* key compare */
1059 dictRedisObjectDestructor, /* key destructor */
1060 dictRedisObjectDestructor /* val destructor */
1061 };
1062
1063 /* Keylist hash table type has unencoded redis objects as keys and
1064 * lists as values. It's used for blocking operations (BLPOP) and to
1065 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1066 static dictType keylistDictType = {
1067 dictObjHash, /* hash function */
1068 NULL, /* key dup */
1069 NULL, /* val dup */
1070 dictObjKeyCompare, /* key compare */
1071 dictRedisObjectDestructor, /* key destructor */
1072 dictListDestructor /* val destructor */
1073 };
1074
1075 /* ========================= Random utility functions ======================= */
1076
1077 /* Redis generally does not try to recover from out of memory conditions
1078 * when allocating objects or strings, it is not clear if it will be possible
1079 * to report this condition to the client since the networking layer itself
1080 * is based on heap allocation for send buffers, so we simply abort.
1081 * At least the code will be simpler to read... */
1082 static void oom(const char *msg) {
1083 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1084 sleep(1);
1085 abort();
1086 }
1087
1088 /* ====================== Redis server networking stuff ===================== */
1089 static void closeTimedoutClients(void) {
1090 redisClient *c;
1091 listNode *ln;
1092 time_t now = time(NULL);
1093 listIter li;
1094
1095 listRewind(server.clients,&li);
1096 while ((ln = listNext(&li)) != NULL) {
1097 c = listNodeValue(ln);
1098 if (server.maxidletime &&
1099 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1100 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1101 (now - c->lastinteraction > server.maxidletime))
1102 {
1103 redisLog(REDIS_VERBOSE,"Closing idle client");
1104 freeClient(c);
1105 } else if (c->flags & REDIS_BLOCKED) {
1106 if (c->blockingto != 0 && c->blockingto < now) {
1107 addReply(c,shared.nullmultibulk);
1108 unblockClientWaitingData(c);
1109 }
1110 }
1111 }
1112 }
1113
1114 static int htNeedsResize(dict *dict) {
1115 long long size, used;
1116
1117 size = dictSlots(dict);
1118 used = dictSize(dict);
1119 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1120 (used*100/size < REDIS_HT_MINFILL));
1121 }
1122
1123 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1124 * we resize the hash table to save memory */
1125 static void tryResizeHashTables(void) {
1126 int j;
1127
1128 for (j = 0; j < server.dbnum; j++) {
1129 if (htNeedsResize(server.db[j].dict)) {
1130 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1131 dictResize(server.db[j].dict);
1132 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1133 }
1134 if (htNeedsResize(server.db[j].expires))
1135 dictResize(server.db[j].expires);
1136 }
1137 }
1138
1139 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1140 void backgroundSaveDoneHandler(int statloc) {
1141 int exitcode = WEXITSTATUS(statloc);
1142 int bysignal = WIFSIGNALED(statloc);
1143
1144 if (!bysignal && exitcode == 0) {
1145 redisLog(REDIS_NOTICE,
1146 "Background saving terminated with success");
1147 server.dirty = 0;
1148 server.lastsave = time(NULL);
1149 } else if (!bysignal && exitcode != 0) {
1150 redisLog(REDIS_WARNING, "Background saving error");
1151 } else {
1152 redisLog(REDIS_WARNING,
1153 "Background saving terminated by signal");
1154 rdbRemoveTempFile(server.bgsavechildpid);
1155 }
1156 server.bgsavechildpid = -1;
1157 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1158 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1159 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1160 }
1161
1162 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1163 * Handle this. */
1164 void backgroundRewriteDoneHandler(int statloc) {
1165 int exitcode = WEXITSTATUS(statloc);
1166 int bysignal = WIFSIGNALED(statloc);
1167
1168 if (!bysignal && exitcode == 0) {
1169 int fd;
1170 char tmpfile[256];
1171
1172 redisLog(REDIS_NOTICE,
1173 "Background append only file rewriting terminated with success");
1174 /* Now it's time to flush the differences accumulated by the parent */
1175 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1176 fd = open(tmpfile,O_WRONLY|O_APPEND);
1177 if (fd == -1) {
1178 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1179 goto cleanup;
1180 }
1181 /* Flush our data... */
1182 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1183 (signed) sdslen(server.bgrewritebuf)) {
1184 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1185 close(fd);
1186 goto cleanup;
1187 }
1188 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1189 /* Now our work is to rename the temp file into the stable file. And
1190 * switch the file descriptor used by the server for append only. */
1191 if (rename(tmpfile,server.appendfilename) == -1) {
1192 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1193 close(fd);
1194 goto cleanup;
1195 }
1196 /* Mission completed... almost */
1197 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1198 if (server.appendfd != -1) {
1199 /* If append only is actually enabled... */
1200 close(server.appendfd);
1201 server.appendfd = fd;
1202 fsync(fd);
1203 server.appendseldb = -1; /* Make sure it will issue SELECT */
1204 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1205 } else {
1206 /* If append only is disabled we just generate a dump in this
1207 * format. Why not? */
1208 close(fd);
1209 }
1210 } else if (!bysignal && exitcode != 0) {
1211 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1212 } else {
1213 redisLog(REDIS_WARNING,
1214 "Background append only file rewriting terminated by signal");
1215 }
1216 cleanup:
1217 sdsfree(server.bgrewritebuf);
1218 server.bgrewritebuf = sdsempty();
1219 aofRemoveTempFile(server.bgrewritechildpid);
1220 server.bgrewritechildpid = -1;
1221 }
1222
1223 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1224 int j, loops = server.cronloops++;
1225 REDIS_NOTUSED(eventLoop);
1226 REDIS_NOTUSED(id);
1227 REDIS_NOTUSED(clientData);
1228
1229 /* We take a cached value of the unix time in the global state because
1230 * with virtual memory and aging there is to store the current time
1231 * in objects at every object access, and accuracy is not needed.
1232 * To access a global var is faster than calling time(NULL) */
1233 server.unixtime = time(NULL);
1234
1235 /* Show some info about non-empty databases */
1236 for (j = 0; j < server.dbnum; j++) {
1237 long long size, used, vkeys;
1238
1239 size = dictSlots(server.db[j].dict);
1240 used = dictSize(server.db[j].dict);
1241 vkeys = dictSize(server.db[j].expires);
1242 if (!(loops % 5) && (used || vkeys)) {
1243 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1244 /* dictPrintStats(server.dict); */
1245 }
1246 }
1247
1248 /* We don't want to resize the hash tables while a bacground saving
1249 * is in progress: the saving child is created using fork() that is
1250 * implemented with a copy-on-write semantic in most modern systems, so
1251 * if we resize the HT while there is the saving child at work actually
1252 * a lot of memory movements in the parent will cause a lot of pages
1253 * copied. */
1254 if (server.bgsavechildpid == -1) tryResizeHashTables();
1255
1256 /* Show information about connected clients */
1257 if (!(loops % 5)) {
1258 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1259 listLength(server.clients)-listLength(server.slaves),
1260 listLength(server.slaves),
1261 zmalloc_used_memory(),
1262 dictSize(server.sharingpool));
1263 }
1264
1265 /* Close connections of timedout clients */
1266 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1267 closeTimedoutClients();
1268
1269 /* Check if a background saving or AOF rewrite in progress terminated */
1270 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1271 int statloc;
1272 pid_t pid;
1273
1274 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1275 if (pid == server.bgsavechildpid) {
1276 backgroundSaveDoneHandler(statloc);
1277 } else {
1278 backgroundRewriteDoneHandler(statloc);
1279 }
1280 }
1281 } else {
1282 /* If there is not a background saving in progress check if
1283 * we have to save now */
1284 time_t now = time(NULL);
1285 for (j = 0; j < server.saveparamslen; j++) {
1286 struct saveparam *sp = server.saveparams+j;
1287
1288 if (server.dirty >= sp->changes &&
1289 now-server.lastsave > sp->seconds) {
1290 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1291 sp->changes, sp->seconds);
1292 rdbSaveBackground(server.dbfilename);
1293 break;
1294 }
1295 }
1296 }
1297
1298 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1299 * will use few CPU cycles if there are few expiring keys, otherwise
1300 * it will get more aggressive to avoid that too much memory is used by
1301 * keys that can be removed from the keyspace. */
1302 for (j = 0; j < server.dbnum; j++) {
1303 int expired;
1304 redisDb *db = server.db+j;
1305
1306 /* Continue to expire if at the end of the cycle more than 25%
1307 * of the keys were expired. */
1308 do {
1309 long num = dictSize(db->expires);
1310 time_t now = time(NULL);
1311
1312 expired = 0;
1313 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1314 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1315 while (num--) {
1316 dictEntry *de;
1317 time_t t;
1318
1319 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1320 t = (time_t) dictGetEntryVal(de);
1321 if (now > t) {
1322 deleteKey(db,dictGetEntryKey(de));
1323 expired++;
1324 }
1325 }
1326 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1327 }
1328
1329 /* Swap a few keys on disk if we are over the memory limit and VM
1330 * is enbled. Try to free objects from the free list first. */
1331 if (vmCanSwapOut()) {
1332 while (server.vm_enabled && zmalloc_used_memory() >
1333 server.vm_max_memory)
1334 {
1335 int retval;
1336
1337 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1338 retval = (server.vm_max_threads == 0) ?
1339 vmSwapOneObjectBlocking() :
1340 vmSwapOneObjectThreaded();
1341 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1342 zmalloc_used_memory() >
1343 (server.vm_max_memory+server.vm_max_memory/10))
1344 {
1345 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1346 }
1347 /* Note that when using threade I/O we free just one object,
1348 * because anyway when the I/O thread in charge to swap this
1349 * object out will finish, the handler of completed jobs
1350 * will try to swap more objects if we are still out of memory. */
1351 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1352 }
1353 }
1354
1355 /* Check if we should connect to a MASTER */
1356 if (server.replstate == REDIS_REPL_CONNECT) {
1357 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1358 if (syncWithMaster() == REDIS_OK) {
1359 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1360 }
1361 }
1362 return 1000;
1363 }
1364
1365 /* This function gets called every time Redis is entering the
1366 * main loop of the event driven library, that is, before to sleep
1367 * for ready file descriptors. */
1368 static void beforeSleep(struct aeEventLoop *eventLoop) {
1369 REDIS_NOTUSED(eventLoop);
1370
1371 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1372 listIter li;
1373 listNode *ln;
1374
1375 listRewind(server.io_ready_clients,&li);
1376 while((ln = listNext(&li))) {
1377 redisClient *c = ln->value;
1378 struct redisCommand *cmd;
1379
1380 /* Resume the client. */
1381 listDelNode(server.io_ready_clients,ln);
1382 c->flags &= (~REDIS_IO_WAIT);
1383 server.vm_blocked_clients--;
1384 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1385 readQueryFromClient, c);
1386 cmd = lookupCommand(c->argv[0]->ptr);
1387 assert(cmd != NULL);
1388 call(c,cmd);
1389 resetClient(c);
1390 /* There may be more data to process in the input buffer. */
1391 if (c->querybuf && sdslen(c->querybuf) > 0)
1392 processInputBuffer(c);
1393 }
1394 }
1395 }
1396
1397 static void createSharedObjects(void) {
1398 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1399 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1400 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1401 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1402 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1403 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1404 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1405 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1406 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1407 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1408 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1409 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1410 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1411 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1412 "-ERR no such key\r\n"));
1413 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1414 "-ERR syntax error\r\n"));
1415 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1416 "-ERR source and destination objects are the same\r\n"));
1417 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1418 "-ERR index out of range\r\n"));
1419 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1420 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1421 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1422 shared.select0 = createStringObject("select 0\r\n",10);
1423 shared.select1 = createStringObject("select 1\r\n",10);
1424 shared.select2 = createStringObject("select 2\r\n",10);
1425 shared.select3 = createStringObject("select 3\r\n",10);
1426 shared.select4 = createStringObject("select 4\r\n",10);
1427 shared.select5 = createStringObject("select 5\r\n",10);
1428 shared.select6 = createStringObject("select 6\r\n",10);
1429 shared.select7 = createStringObject("select 7\r\n",10);
1430 shared.select8 = createStringObject("select 8\r\n",10);
1431 shared.select9 = createStringObject("select 9\r\n",10);
1432 }
1433
1434 static void appendServerSaveParams(time_t seconds, int changes) {
1435 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1436 server.saveparams[server.saveparamslen].seconds = seconds;
1437 server.saveparams[server.saveparamslen].changes = changes;
1438 server.saveparamslen++;
1439 }
1440
1441 static void resetServerSaveParams() {
1442 zfree(server.saveparams);
1443 server.saveparams = NULL;
1444 server.saveparamslen = 0;
1445 }
1446
1447 static void initServerConfig() {
1448 server.dbnum = REDIS_DEFAULT_DBNUM;
1449 server.port = REDIS_SERVERPORT;
1450 server.verbosity = REDIS_VERBOSE;
1451 server.maxidletime = REDIS_MAXIDLETIME;
1452 server.saveparams = NULL;
1453 server.logfile = NULL; /* NULL = log on standard output */
1454 server.bindaddr = NULL;
1455 server.glueoutputbuf = 1;
1456 server.daemonize = 0;
1457 server.appendonly = 0;
1458 server.appendfsync = APPENDFSYNC_ALWAYS;
1459 server.lastfsync = time(NULL);
1460 server.appendfd = -1;
1461 server.appendseldb = -1; /* Make sure the first time will not match */
1462 server.pidfile = "/var/run/redis.pid";
1463 server.dbfilename = "dump.rdb";
1464 server.appendfilename = "appendonly.aof";
1465 server.requirepass = NULL;
1466 server.shareobjects = 0;
1467 server.rdbcompression = 1;
1468 server.sharingpoolsize = 1024;
1469 server.maxclients = 0;
1470 server.blpop_blocked_clients = 0;
1471 server.maxmemory = 0;
1472 server.vm_enabled = 0;
1473 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1474 server.vm_page_size = 256; /* 256 bytes per page */
1475 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1476 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1477 server.vm_max_threads = 4;
1478 server.vm_blocked_clients = 0;
1479 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1480 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1481
1482 resetServerSaveParams();
1483
1484 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1485 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1486 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1487 /* Replication related */
1488 server.isslave = 0;
1489 server.masterauth = NULL;
1490 server.masterhost = NULL;
1491 server.masterport = 6379;
1492 server.master = NULL;
1493 server.replstate = REDIS_REPL_NONE;
1494
1495 /* Double constants initialization */
1496 R_Zero = 0.0;
1497 R_PosInf = 1.0/R_Zero;
1498 R_NegInf = -1.0/R_Zero;
1499 R_Nan = R_Zero/R_Zero;
1500 }
1501
1502 static void initServer() {
1503 int j;
1504
1505 signal(SIGHUP, SIG_IGN);
1506 signal(SIGPIPE, SIG_IGN);
1507 setupSigSegvAction();
1508
1509 server.devnull = fopen("/dev/null","w");
1510 if (server.devnull == NULL) {
1511 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1512 exit(1);
1513 }
1514 server.clients = listCreate();
1515 server.slaves = listCreate();
1516 server.monitors = listCreate();
1517 server.objfreelist = listCreate();
1518 createSharedObjects();
1519 server.el = aeCreateEventLoop();
1520 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1521 server.sharingpool = dictCreate(&setDictType,NULL);
1522 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1523 if (server.fd == -1) {
1524 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1525 exit(1);
1526 }
1527 for (j = 0; j < server.dbnum; j++) {
1528 server.db[j].dict = dictCreate(&dbDictType,NULL);
1529 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1530 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1531 if (server.vm_enabled)
1532 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1533 server.db[j].id = j;
1534 }
1535 server.cronloops = 0;
1536 server.bgsavechildpid = -1;
1537 server.bgrewritechildpid = -1;
1538 server.bgrewritebuf = sdsempty();
1539 server.lastsave = time(NULL);
1540 server.dirty = 0;
1541 server.stat_numcommands = 0;
1542 server.stat_numconnections = 0;
1543 server.stat_starttime = time(NULL);
1544 server.unixtime = time(NULL);
1545 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1546 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1547 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1548
1549 if (server.appendonly) {
1550 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1551 if (server.appendfd == -1) {
1552 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1553 strerror(errno));
1554 exit(1);
1555 }
1556 }
1557
1558 if (server.vm_enabled) vmInit();
1559 }
1560
1561 /* Empty the whole database */
1562 static long long emptyDb() {
1563 int j;
1564 long long removed = 0;
1565
1566 for (j = 0; j < server.dbnum; j++) {
1567 removed += dictSize(server.db[j].dict);
1568 dictEmpty(server.db[j].dict);
1569 dictEmpty(server.db[j].expires);
1570 }
1571 return removed;
1572 }
1573
/* Convert a "yes" / "no" string (compared case-insensitively) into
 * 1 / 0. Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1579
1580 /* I agree, this is a very rudimental way to load a configuration...
1581 will improve later if the config gets more complex */
1582 static void loadServerConfig(char *filename) {
1583 FILE *fp;
1584 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1585 int linenum = 0;
1586 sds line = NULL;
1587
1588 if (filename[0] == '-' && filename[1] == '\0')
1589 fp = stdin;
1590 else {
1591 if ((fp = fopen(filename,"r")) == NULL) {
1592 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1593 exit(1);
1594 }
1595 }
1596
1597 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1598 sds *argv;
1599 int argc, j;
1600
1601 linenum++;
1602 line = sdsnew(buf);
1603 line = sdstrim(line," \t\r\n");
1604
1605 /* Skip comments and blank lines*/
1606 if (line[0] == '#' || line[0] == '\0') {
1607 sdsfree(line);
1608 continue;
1609 }
1610
1611 /* Split into arguments */
1612 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1613 sdstolower(argv[0]);
1614
1615 /* Execute config directives */
1616 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1617 server.maxidletime = atoi(argv[1]);
1618 if (server.maxidletime < 0) {
1619 err = "Invalid timeout value"; goto loaderr;
1620 }
1621 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1622 server.port = atoi(argv[1]);
1623 if (server.port < 1 || server.port > 65535) {
1624 err = "Invalid port"; goto loaderr;
1625 }
1626 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1627 server.bindaddr = zstrdup(argv[1]);
1628 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1629 int seconds = atoi(argv[1]);
1630 int changes = atoi(argv[2]);
1631 if (seconds < 1 || changes < 0) {
1632 err = "Invalid save parameters"; goto loaderr;
1633 }
1634 appendServerSaveParams(seconds,changes);
1635 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1636 if (chdir(argv[1]) == -1) {
1637 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1638 argv[1], strerror(errno));
1639 exit(1);
1640 }
1641 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1642 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1643 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1644 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1645 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1646 else {
1647 err = "Invalid log level. Must be one of debug, notice, warning";
1648 goto loaderr;
1649 }
1650 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1651 FILE *logfp;
1652
1653 server.logfile = zstrdup(argv[1]);
1654 if (!strcasecmp(server.logfile,"stdout")) {
1655 zfree(server.logfile);
1656 server.logfile = NULL;
1657 }
1658 if (server.logfile) {
1659 /* Test if we are able to open the file. The server will not
1660 * be able to abort just for this problem later... */
1661 logfp = fopen(server.logfile,"a");
1662 if (logfp == NULL) {
1663 err = sdscatprintf(sdsempty(),
1664 "Can't open the log file: %s", strerror(errno));
1665 goto loaderr;
1666 }
1667 fclose(logfp);
1668 }
1669 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1670 server.dbnum = atoi(argv[1]);
1671 if (server.dbnum < 1) {
1672 err = "Invalid number of databases"; goto loaderr;
1673 }
1674 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1675 server.maxclients = atoi(argv[1]);
1676 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1677 server.maxmemory = strtoll(argv[1], NULL, 10);
1678 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1679 server.masterhost = sdsnew(argv[1]);
1680 server.masterport = atoi(argv[2]);
1681 server.replstate = REDIS_REPL_CONNECT;
1682 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1683 server.masterauth = zstrdup(argv[1]);
1684 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1685 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1686 err = "argument must be 'yes' or 'no'"; goto loaderr;
1687 }
1688 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1689 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1690 err = "argument must be 'yes' or 'no'"; goto loaderr;
1691 }
1692 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1693 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1694 err = "argument must be 'yes' or 'no'"; goto loaderr;
1695 }
1696 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1697 server.sharingpoolsize = atoi(argv[1]);
1698 if (server.sharingpoolsize < 1) {
1699 err = "invalid object sharing pool size"; goto loaderr;
1700 }
1701 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1702 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1703 err = "argument must be 'yes' or 'no'"; goto loaderr;
1704 }
1705 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1706 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1707 err = "argument must be 'yes' or 'no'"; goto loaderr;
1708 }
1709 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1710 if (!strcasecmp(argv[1],"no")) {
1711 server.appendfsync = APPENDFSYNC_NO;
1712 } else if (!strcasecmp(argv[1],"always")) {
1713 server.appendfsync = APPENDFSYNC_ALWAYS;
1714 } else if (!strcasecmp(argv[1],"everysec")) {
1715 server.appendfsync = APPENDFSYNC_EVERYSEC;
1716 } else {
1717 err = "argument must be 'no', 'always' or 'everysec'";
1718 goto loaderr;
1719 }
1720 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1721 server.requirepass = zstrdup(argv[1]);
1722 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1723 server.pidfile = zstrdup(argv[1]);
1724 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1725 server.dbfilename = zstrdup(argv[1]);
1726 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1727 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1728 err = "argument must be 'yes' or 'no'"; goto loaderr;
1729 }
1730 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1731 zfree(server.vm_swap_file);
1732 server.vm_swap_file = zstrdup(argv[1]);
1733 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1734 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1735 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1736 server.vm_page_size = strtoll(argv[1], NULL, 10);
1737 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1738 server.vm_pages = strtoll(argv[1], NULL, 10);
1739 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1740 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1741 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1742 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1743 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1744 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1745 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1746 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1747 } else {
1748 err = "Bad directive or wrong number of arguments"; goto loaderr;
1749 }
1750 for (j = 0; j < argc; j++)
1751 sdsfree(argv[j]);
1752 zfree(argv);
1753 sdsfree(line);
1754 }
1755 if (fp != stdin) fclose(fp);
1756 return;
1757
1758 loaderr:
1759 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1760 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1761 fprintf(stderr, ">>> '%s'\n", line);
1762 fprintf(stderr, "%s\n", err);
1763 exit(1);
1764 }
1765
1766 static void freeClientArgv(redisClient *c) {
1767 int j;
1768
1769 for (j = 0; j < c->argc; j++)
1770 decrRefCount(c->argv[j]);
1771 for (j = 0; j < c->mbargc; j++)
1772 decrRefCount(c->mbargv[j]);
1773 c->argc = 0;
1774 c->mbargc = 0;
1775 }
1776
1777 static void freeClient(redisClient *c) {
1778 listNode *ln;
1779
1780 /* Note that if the client we are freeing is blocked into a blocking
1781 * call, we have to set querybuf to NULL *before* to call
1782 * unblockClientWaitingData() to avoid processInputBuffer() will get
1783 * called. Also it is important to remove the file events after
1784 * this, because this call adds the READABLE event. */
1785 sdsfree(c->querybuf);
1786 c->querybuf = NULL;
1787 if (c->flags & REDIS_BLOCKED)
1788 unblockClientWaitingData(c);
1789
1790 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1791 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1792 listRelease(c->reply);
1793 freeClientArgv(c);
1794 close(c->fd);
1795 /* Remove from the list of clients */
1796 ln = listSearchKey(server.clients,c);
1797 redisAssert(ln != NULL);
1798 listDelNode(server.clients,ln);
1799 /* Remove from the list of clients waiting for swapped keys */
1800 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1801 ln = listSearchKey(server.io_ready_clients,c);
1802 if (ln) {
1803 listDelNode(server.io_ready_clients,ln);
1804 server.vm_blocked_clients--;
1805 }
1806 }
1807 while (server.vm_enabled && listLength(c->io_keys)) {
1808 ln = listFirst(c->io_keys);
1809 dontWaitForSwappedKey(c,ln->value);
1810 }
1811 listRelease(c->io_keys);
1812 /* Other cleanup */
1813 if (c->flags & REDIS_SLAVE) {
1814 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1815 close(c->repldbfd);
1816 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1817 ln = listSearchKey(l,c);
1818 redisAssert(ln != NULL);
1819 listDelNode(l,ln);
1820 }
1821 if (c->flags & REDIS_MASTER) {
1822 server.master = NULL;
1823 server.replstate = REDIS_REPL_CONNECT;
1824 }
1825 zfree(c->argv);
1826 zfree(c->mbargv);
1827 freeClientMultiState(c);
1828 zfree(c);
1829 }
1830
1831 #define GLUEREPLY_UP_TO (1024)
1832 static void glueReplyBuffersIfNeeded(redisClient *c) {
1833 int copylen = 0;
1834 char buf[GLUEREPLY_UP_TO];
1835 listNode *ln;
1836 listIter li;
1837 robj *o;
1838
1839 listRewind(c->reply,&li);
1840 while((ln = listNext(&li))) {
1841 int objlen;
1842
1843 o = ln->value;
1844 objlen = sdslen(o->ptr);
1845 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1846 memcpy(buf+copylen,o->ptr,objlen);
1847 copylen += objlen;
1848 listDelNode(c->reply,ln);
1849 } else {
1850 if (copylen == 0) return;
1851 break;
1852 }
1853 }
1854 /* Now the output buffer is empty, add the new single element */
1855 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1856 listAddNodeHead(c->reply,o);
1857 }
1858
1859 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1860 redisClient *c = privdata;
1861 int nwritten = 0, totwritten = 0, objlen;
1862 robj *o;
1863 REDIS_NOTUSED(el);
1864 REDIS_NOTUSED(mask);
1865
1866 /* Use writev() if we have enough buffers to send */
1867 if (!server.glueoutputbuf &&
1868 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1869 !(c->flags & REDIS_MASTER))
1870 {
1871 sendReplyToClientWritev(el, fd, privdata, mask);
1872 return;
1873 }
1874
1875 while(listLength(c->reply)) {
1876 if (server.glueoutputbuf && listLength(c->reply) > 1)
1877 glueReplyBuffersIfNeeded(c);
1878
1879 o = listNodeValue(listFirst(c->reply));
1880 objlen = sdslen(o->ptr);
1881
1882 if (objlen == 0) {
1883 listDelNode(c->reply,listFirst(c->reply));
1884 continue;
1885 }
1886
1887 if (c->flags & REDIS_MASTER) {
1888 /* Don't reply to a master */
1889 nwritten = objlen - c->sentlen;
1890 } else {
1891 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1892 if (nwritten <= 0) break;
1893 }
1894 c->sentlen += nwritten;
1895 totwritten += nwritten;
1896 /* If we fully sent the object on head go to the next one */
1897 if (c->sentlen == objlen) {
1898 listDelNode(c->reply,listFirst(c->reply));
1899 c->sentlen = 0;
1900 }
1901 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1902 * bytes, in a single threaded server it's a good idea to serve
1903 * other clients as well, even if a very large request comes from
1904 * super fast link that is always able to accept data (in real world
1905 * scenario think about 'KEYS *' against the loopback interfae) */
1906 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1907 }
1908 if (nwritten == -1) {
1909 if (errno == EAGAIN) {
1910 nwritten = 0;
1911 } else {
1912 redisLog(REDIS_VERBOSE,
1913 "Error writing to client: %s", strerror(errno));
1914 freeClient(c);
1915 return;
1916 }
1917 }
1918 if (totwritten > 0) c->lastinteraction = time(NULL);
1919 if (listLength(c->reply) == 0) {
1920 c->sentlen = 0;
1921 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1922 }
1923 }
1924
1925 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1926 {
1927 redisClient *c = privdata;
1928 int nwritten = 0, totwritten = 0, objlen, willwrite;
1929 robj *o;
1930 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1931 int offset, ion = 0;
1932 REDIS_NOTUSED(el);
1933 REDIS_NOTUSED(mask);
1934
1935 listNode *node;
1936 while (listLength(c->reply)) {
1937 offset = c->sentlen;
1938 ion = 0;
1939 willwrite = 0;
1940
1941 /* fill-in the iov[] array */
1942 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1943 o = listNodeValue(node);
1944 objlen = sdslen(o->ptr);
1945
1946 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1947 break;
1948
1949 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1950 break; /* no more iovecs */
1951
1952 iov[ion].iov_base = ((char*)o->ptr) + offset;
1953 iov[ion].iov_len = objlen - offset;
1954 willwrite += objlen - offset;
1955 offset = 0; /* just for the first item */
1956 ion++;
1957 }
1958
1959 if(willwrite == 0)
1960 break;
1961
1962 /* write all collected blocks at once */
1963 if((nwritten = writev(fd, iov, ion)) < 0) {
1964 if (errno != EAGAIN) {
1965 redisLog(REDIS_VERBOSE,
1966 "Error writing to client: %s", strerror(errno));
1967 freeClient(c);
1968 return;
1969 }
1970 break;
1971 }
1972
1973 totwritten += nwritten;
1974 offset = c->sentlen;
1975
1976 /* remove written robjs from c->reply */
1977 while (nwritten && listLength(c->reply)) {
1978 o = listNodeValue(listFirst(c->reply));
1979 objlen = sdslen(o->ptr);
1980
1981 if(nwritten >= objlen - offset) {
1982 listDelNode(c->reply, listFirst(c->reply));
1983 nwritten -= objlen - offset;
1984 c->sentlen = 0;
1985 } else {
1986 /* partial write */
1987 c->sentlen += nwritten;
1988 break;
1989 }
1990 offset = 0;
1991 }
1992 }
1993
1994 if (totwritten > 0)
1995 c->lastinteraction = time(NULL);
1996
1997 if (listLength(c->reply) == 0) {
1998 c->sentlen = 0;
1999 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2000 }
2001 }
2002
2003 static struct redisCommand *lookupCommand(char *name) {
2004 int j = 0;
2005 while(cmdTable[j].name != NULL) {
2006 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2007 j++;
2008 }
2009 return NULL;
2010 }
2011
2012 /* resetClient prepare the client to process the next command */
2013 static void resetClient(redisClient *c) {
2014 freeClientArgv(c);
2015 c->bulklen = -1;
2016 c->multibulk = 0;
2017 }
2018
2019 /* Call() is the core of Redis execution of a command */
2020 static void call(redisClient *c, struct redisCommand *cmd) {
2021 long long dirty;
2022
2023 dirty = server.dirty;
2024 cmd->proc(c);
2025 if (server.appendonly && server.dirty-dirty)
2026 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2027 if (server.dirty-dirty && listLength(server.slaves))
2028 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2029 if (listLength(server.monitors))
2030 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2031 server.stat_numcommands++;
2032 }
2033
2034 /* If this function gets called we already read a whole
2035 * command, argments are in the client argv/argc fields.
2036 * processCommand() execute the command or prepare the
2037 * server for a bulk read from the client.
2038 *
2039 * If 1 is returned the client is still alive and valid and
2040 * and other operations can be performed by the caller. Otherwise
2041 * if 0 is returned the client was destroied (i.e. after QUIT). */
2042 static int processCommand(redisClient *c) {
2043 struct redisCommand *cmd;
2044
2045 /* Free some memory if needed (maxmemory setting) */
2046 if (server.maxmemory) freeMemoryIfNeeded();
2047
2048 /* Handle the multi bulk command type. This is an alternative protocol
2049 * supported by Redis in order to receive commands that are composed of
2050 * multiple binary-safe "bulk" arguments. The latency of processing is
2051 * a bit higher but this allows things like multi-sets, so if this
2052 * protocol is used only for MSET and similar commands this is a big win. */
2053 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2054 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2055 if (c->multibulk <= 0) {
2056 resetClient(c);
2057 return 1;
2058 } else {
2059 decrRefCount(c->argv[c->argc-1]);
2060 c->argc--;
2061 return 1;
2062 }
2063 } else if (c->multibulk) {
2064 if (c->bulklen == -1) {
2065 if (((char*)c->argv[0]->ptr)[0] != '$') {
2066 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2067 resetClient(c);
2068 return 1;
2069 } else {
2070 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2071 decrRefCount(c->argv[0]);
2072 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2073 c->argc--;
2074 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2075 resetClient(c);
2076 return 1;
2077 }
2078 c->argc--;
2079 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2080 return 1;
2081 }
2082 } else {
2083 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2084 c->mbargv[c->mbargc] = c->argv[0];
2085 c->mbargc++;
2086 c->argc--;
2087 c->multibulk--;
2088 if (c->multibulk == 0) {
2089 robj **auxargv;
2090 int auxargc;
2091
2092 /* Here we need to swap the multi-bulk argc/argv with the
2093 * normal argc/argv of the client structure. */
2094 auxargv = c->argv;
2095 c->argv = c->mbargv;
2096 c->mbargv = auxargv;
2097
2098 auxargc = c->argc;
2099 c->argc = c->mbargc;
2100 c->mbargc = auxargc;
2101
2102 /* We need to set bulklen to something different than -1
2103 * in order for the code below to process the command without
2104 * to try to read the last argument of a bulk command as
2105 * a special argument. */
2106 c->bulklen = 0;
2107 /* continue below and process the command */
2108 } else {
2109 c->bulklen = -1;
2110 return 1;
2111 }
2112 }
2113 }
2114 /* -- end of multi bulk commands processing -- */
2115
2116 /* The QUIT command is handled as a special case. Normal command
2117 * procs are unable to close the client connection safely */
2118 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2119 freeClient(c);
2120 return 0;
2121 }
2122
2123 /* Now lookup the command and check ASAP about trivial error conditions
2124 * such wrong arity, bad command name and so forth. */
2125 cmd = lookupCommand(c->argv[0]->ptr);
2126 if (!cmd) {
2127 addReplySds(c,
2128 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2129 (char*)c->argv[0]->ptr));
2130 resetClient(c);
2131 return 1;
2132 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2133 (c->argc < -cmd->arity)) {
2134 addReplySds(c,
2135 sdscatprintf(sdsempty(),
2136 "-ERR wrong number of arguments for '%s' command\r\n",
2137 cmd->name));
2138 resetClient(c);
2139 return 1;
2140 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2141 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2142 resetClient(c);
2143 return 1;
2144 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2145 /* This is a bulk command, we have to read the last argument yet. */
2146 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2147
2148 decrRefCount(c->argv[c->argc-1]);
2149 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2150 c->argc--;
2151 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2152 resetClient(c);
2153 return 1;
2154 }
2155 c->argc--;
2156 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2157 /* It is possible that the bulk read is already in the
2158 * buffer. Check this condition and handle it accordingly.
2159 * This is just a fast path, alternative to call processInputBuffer().
2160 * It's a good idea since the code is small and this condition
2161 * happens most of the times. */
2162 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2163 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2164 c->argc++;
2165 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2166 } else {
2167 /* Otherwise return... there is to read the last argument
2168 * from the socket. */
2169 return 1;
2170 }
2171 }
2172 /* Let's try to share objects on the command arguments vector */
2173 if (server.shareobjects) {
2174 int j;
2175 for(j = 1; j < c->argc; j++)
2176 c->argv[j] = tryObjectSharing(c->argv[j]);
2177 }
2178 /* Let's try to encode the bulk object to save space. */
2179 if (cmd->flags & REDIS_CMD_BULK)
2180 tryObjectEncoding(c->argv[c->argc-1]);
2181
2182 /* Check if the user is authenticated */
2183 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2184 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2185 resetClient(c);
2186 return 1;
2187 }
2188
2189 /* Exec the command */
2190 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2191 queueMultiCommand(c,cmd);
2192 addReply(c,shared.queued);
2193 } else {
2194 if (server.vm_enabled && server.vm_max_threads > 0 &&
2195 blockClientOnSwappedKeys(cmd,c)) return 1;
2196 call(c,cmd);
2197 }
2198
2199 /* Prepare the client for the next command */
2200 resetClient(c);
2201 return 1;
2202 }
2203
2204 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2205 listNode *ln;
2206 listIter li;
2207 int outc = 0, j;
2208 robj **outv;
2209 /* (args*2)+1 is enough room for args, spaces, newlines */
2210 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2211
2212 if (argc <= REDIS_STATIC_ARGS) {
2213 outv = static_outv;
2214 } else {
2215 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2216 }
2217
2218 for (j = 0; j < argc; j++) {
2219 if (j != 0) outv[outc++] = shared.space;
2220 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2221 robj *lenobj;
2222
2223 lenobj = createObject(REDIS_STRING,
2224 sdscatprintf(sdsempty(),"%lu\r\n",
2225 (unsigned long) stringObjectLen(argv[j])));
2226 lenobj->refcount = 0;
2227 outv[outc++] = lenobj;
2228 }
2229 outv[outc++] = argv[j];
2230 }
2231 outv[outc++] = shared.crlf;
2232
2233 /* Increment all the refcounts at start and decrement at end in order to
2234 * be sure to free objects if there is no slave in a replication state
2235 * able to be feed with commands */
2236 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2237 listRewind(slaves,&li);
2238 while((ln = listNext(&li))) {
2239 redisClient *slave = ln->value;
2240
2241 /* Don't feed slaves that are still waiting for BGSAVE to start */
2242 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2243
2244 /* Feed all the other slaves, MONITORs and so on */
2245 if (slave->slaveseldb != dictid) {
2246 robj *selectcmd;
2247
2248 switch(dictid) {
2249 case 0: selectcmd = shared.select0; break;
2250 case 1: selectcmd = shared.select1; break;
2251 case 2: selectcmd = shared.select2; break;
2252 case 3: selectcmd = shared.select3; break;
2253 case 4: selectcmd = shared.select4; break;
2254 case 5: selectcmd = shared.select5; break;
2255 case 6: selectcmd = shared.select6; break;
2256 case 7: selectcmd = shared.select7; break;
2257 case 8: selectcmd = shared.select8; break;
2258 case 9: selectcmd = shared.select9; break;
2259 default:
2260 selectcmd = createObject(REDIS_STRING,
2261 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2262 selectcmd->refcount = 0;
2263 break;
2264 }
2265 addReply(slave,selectcmd);
2266 slave->slaveseldb = dictid;
2267 }
2268 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2269 }
2270 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2271 if (outv != static_outv) zfree(outv);
2272 }
2273
2274 static void processInputBuffer(redisClient *c) {
2275 again:
2276 /* Before to process the input buffer, make sure the client is not
2277 * waitig for a blocking operation such as BLPOP. Note that the first
2278 * iteration the client is never blocked, otherwise the processInputBuffer
2279 * would not be called at all, but after the execution of the first commands
2280 * in the input buffer the client may be blocked, and the "goto again"
2281 * will try to reiterate. The following line will make it return asap. */
2282 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2283 if (c->bulklen == -1) {
2284 /* Read the first line of the query */
2285 char *p = strchr(c->querybuf,'\n');
2286 size_t querylen;
2287
2288 if (p) {
2289 sds query, *argv;
2290 int argc, j;
2291
2292 query = c->querybuf;
2293 c->querybuf = sdsempty();
2294 querylen = 1+(p-(query));
2295 if (sdslen(query) > querylen) {
2296 /* leave data after the first line of the query in the buffer */
2297 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2298 }
2299 *p = '\0'; /* remove "\n" */
2300 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2301 sdsupdatelen(query);
2302
2303 /* Now we can split the query in arguments */
2304 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2305 sdsfree(query);
2306
2307 if (c->argv) zfree(c->argv);
2308 c->argv = zmalloc(sizeof(robj*)*argc);
2309
2310 for (j = 0; j < argc; j++) {
2311 if (sdslen(argv[j])) {
2312 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2313 c->argc++;
2314 } else {
2315 sdsfree(argv[j]);
2316 }
2317 }
2318 zfree(argv);
2319 if (c->argc) {
2320 /* Execute the command. If the client is still valid
2321 * after processCommand() return and there is something
2322 * on the query buffer try to process the next command. */
2323 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2324 } else {
2325 /* Nothing to process, argc == 0. Just process the query
2326 * buffer if it's not empty or return to the caller */
2327 if (sdslen(c->querybuf)) goto again;
2328 }
2329 return;
2330 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2331 redisLog(REDIS_VERBOSE, "Client protocol error");
2332 freeClient(c);
2333 return;
2334 }
2335 } else {
2336 /* Bulk read handling. Note that if we are at this point
2337 the client already sent a command terminated with a newline,
2338 we are reading the bulk data that is actually the last
2339 argument of the command. */
2340 int qbl = sdslen(c->querybuf);
2341
2342 if (c->bulklen <= qbl) {
2343 /* Copy everything but the final CRLF as final argument */
2344 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2345 c->argc++;
2346 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2347 /* Process the command. If the client is still valid after
2348 * the processing and there is more data in the buffer
2349 * try to parse it. */
2350 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2351 return;
2352 }
2353 }
2354 }
2355
2356 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2357 redisClient *c = (redisClient*) privdata;
2358 char buf[REDIS_IOBUF_LEN];
2359 int nread;
2360 REDIS_NOTUSED(el);
2361 REDIS_NOTUSED(mask);
2362
2363 nread = read(fd, buf, REDIS_IOBUF_LEN);
2364 if (nread == -1) {
2365 if (errno == EAGAIN) {
2366 nread = 0;
2367 } else {
2368 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2369 freeClient(c);
2370 return;
2371 }
2372 } else if (nread == 0) {
2373 redisLog(REDIS_VERBOSE, "Client closed connection");
2374 freeClient(c);
2375 return;
2376 }
2377 if (nread) {
2378 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2379 c->lastinteraction = time(NULL);
2380 } else {
2381 return;
2382 }
2383 if (!(c->flags & REDIS_BLOCKED))
2384 processInputBuffer(c);
2385 }
2386
2387 static int selectDb(redisClient *c, int id) {
2388 if (id < 0 || id >= server.dbnum)
2389 return REDIS_ERR;
2390 c->db = &server.db[id];
2391 return REDIS_OK;
2392 }
2393
2394 static void *dupClientReplyValue(void *o) {
2395 incrRefCount((robj*)o);
2396 return o;
2397 }
2398
2399 static redisClient *createClient(int fd) {
2400 redisClient *c = zmalloc(sizeof(*c));
2401
2402 anetNonBlock(NULL,fd);
2403 anetTcpNoDelay(NULL,fd);
2404 if (!c) return NULL;
2405 selectDb(c,0);
2406 c->fd = fd;
2407 c->querybuf = sdsempty();
2408 c->argc = 0;
2409 c->argv = NULL;
2410 c->bulklen = -1;
2411 c->multibulk = 0;
2412 c->mbargc = 0;
2413 c->mbargv = NULL;
2414 c->sentlen = 0;
2415 c->flags = 0;
2416 c->lastinteraction = time(NULL);
2417 c->authenticated = 0;
2418 c->replstate = REDIS_REPL_NONE;
2419 c->reply = listCreate();
2420 listSetFreeMethod(c->reply,decrRefCount);
2421 listSetDupMethod(c->reply,dupClientReplyValue);
2422 c->blockingkeys = NULL;
2423 c->blockingkeysnum = 0;
2424 c->io_keys = listCreate();
2425 listSetFreeMethod(c->io_keys,decrRefCount);
2426 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2427 readQueryFromClient, c) == AE_ERR) {
2428 freeClient(c);
2429 return NULL;
2430 }
2431 listAddNodeTail(server.clients,c);
2432 initClientMultiState(c);
2433 return c;
2434 }
2435
2436 static void addReply(redisClient *c, robj *obj) {
2437 if (listLength(c->reply) == 0 &&
2438 (c->replstate == REDIS_REPL_NONE ||
2439 c->replstate == REDIS_REPL_ONLINE) &&
2440 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2441 sendReplyToClient, c) == AE_ERR) return;
2442
2443 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2444 obj = dupStringObject(obj);
2445 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2446 }
2447 listAddNodeTail(c->reply,getDecodedObject(obj));
2448 }
2449
2450 static void addReplySds(redisClient *c, sds s) {
2451 robj *o = createObject(REDIS_STRING,s);
2452 addReply(c,o);
2453 decrRefCount(o);
2454 }
2455
2456 static void addReplyDouble(redisClient *c, double d) {
2457 char buf[128];
2458
2459 snprintf(buf,sizeof(buf),"%.17g",d);
2460 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2461 (unsigned long) strlen(buf),buf));
2462 }
2463
2464 static void addReplyLong(redisClient *c, long l) {
2465 char buf[128];
2466 size_t len;
2467
2468 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2469 addReplySds(c,sdsnewlen(buf,len));
2470 }
2471
2472 static void addReplyBulkLen(redisClient *c, robj *obj) {
2473 size_t len;
2474
2475 if (obj->encoding == REDIS_ENCODING_RAW) {
2476 len = sdslen(obj->ptr);
2477 } else {
2478 long n = (long)obj->ptr;
2479
2480 /* Compute how many bytes will take this integer as a radix 10 string */
2481 len = 1;
2482 if (n < 0) {
2483 len++;
2484 n = -n;
2485 }
2486 while((n = n/10) != 0) {
2487 len++;
2488 }
2489 }
2490 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2491 }
2492
2493 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2494 int cport, cfd;
2495 char cip[128];
2496 redisClient *c;
2497 REDIS_NOTUSED(el);
2498 REDIS_NOTUSED(mask);
2499 REDIS_NOTUSED(privdata);
2500
2501 cfd = anetAccept(server.neterr, fd, cip, &cport);
2502 if (cfd == AE_ERR) {
2503 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2504 return;
2505 }
2506 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2507 if ((c = createClient(cfd)) == NULL) {
2508 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2509 close(cfd); /* May be already closed, just ingore errors */
2510 return;
2511 }
2512 /* If maxclient directive is set and this is one client more... close the
2513 * connection. Note that we create the client instead to check before
2514 * for this condition, since now the socket is already set in nonblocking
2515 * mode and we can send an error for free using the Kernel I/O */
2516 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2517 char *err = "-ERR max number of clients reached\r\n";
2518
2519 /* That's a best effort error message, don't check write errors */
2520 if (write(c->fd,err,strlen(err)) == -1) {
2521 /* Nothing to do, Just to avoid the warning... */
2522 }
2523 freeClient(c);
2524 return;
2525 }
2526 server.stat_numconnections++;
2527 }
2528
2529 /* ======================= Redis objects implementation ===================== */
2530
2531 static robj *createObject(int type, void *ptr) {
2532 robj *o;
2533
2534 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2535 if (listLength(server.objfreelist)) {
2536 listNode *head = listFirst(server.objfreelist);
2537 o = listNodeValue(head);
2538 listDelNode(server.objfreelist,head);
2539 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2540 } else {
2541 if (server.vm_enabled) {
2542 pthread_mutex_unlock(&server.obj_freelist_mutex);
2543 o = zmalloc(sizeof(*o));
2544 } else {
2545 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2546 }
2547 }
2548 o->type = type;
2549 o->encoding = REDIS_ENCODING_RAW;
2550 o->ptr = ptr;
2551 o->refcount = 1;
2552 if (server.vm_enabled) {
2553 /* Note that this code may run in the context of an I/O thread
2554 * and accessing to server.unixtime in theory is an error
2555 * (no locks). But in practice this is safe, and even if we read
2556 * garbage Redis will not fail, as it's just a statistical info */
2557 o->vm.atime = server.unixtime;
2558 o->storage = REDIS_VM_MEMORY;
2559 }
2560 return o;
2561 }
2562
2563 static robj *createStringObject(char *ptr, size_t len) {
2564 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2565 }
2566
2567 static robj *dupStringObject(robj *o) {
2568 assert(o->encoding == REDIS_ENCODING_RAW);
2569 return createStringObject(o->ptr,sdslen(o->ptr));
2570 }
2571
2572 static robj *createListObject(void) {
2573 list *l = listCreate();
2574
2575 listSetFreeMethod(l,decrRefCount);
2576 return createObject(REDIS_LIST,l);
2577 }
2578
2579 static robj *createSetObject(void) {
2580 dict *d = dictCreate(&setDictType,NULL);
2581 return createObject(REDIS_SET,d);
2582 }
2583
2584 static robj *createHashObject(void) {
2585 /* All the Hashes start as zipmaps. Will be automatically converted
2586 * into hash tables if there are enough elements or big elements
2587 * inside. */
2588 unsigned char *zm = zipmapNew();
2589 robj *o = createObject(REDIS_HASH,zm);
2590 o->encoding = REDIS_ENCODING_ZIPMAP;
2591 return o;
2592 }
2593
2594 static robj *createZsetObject(void) {
2595 zset *zs = zmalloc(sizeof(*zs));
2596
2597 zs->dict = dictCreate(&zsetDictType,NULL);
2598 zs->zsl = zslCreate();
2599 return createObject(REDIS_ZSET,zs);
2600 }
2601
2602 static void freeStringObject(robj *o) {
2603 if (o->encoding == REDIS_ENCODING_RAW) {
2604 sdsfree(o->ptr);
2605 }
2606 }
2607
2608 static void freeListObject(robj *o) {
2609 listRelease((list*) o->ptr);
2610 }
2611
2612 static void freeSetObject(robj *o) {
2613 dictRelease((dict*) o->ptr);
2614 }
2615
2616 static void freeZsetObject(robj *o) {
2617 zset *zs = o->ptr;
2618
2619 dictRelease(zs->dict);
2620 zslFree(zs->zsl);
2621 zfree(zs);
2622 }
2623
2624 static void freeHashObject(robj *o) {
2625 switch (o->encoding) {
2626 case REDIS_ENCODING_HT:
2627 dictRelease((dict*) o->ptr);
2628 break;
2629 case REDIS_ENCODING_ZIPMAP:
2630 zfree(o->ptr);
2631 break;
2632 default:
2633 redisAssert(0);
2634 break;
2635 }
2636 }
2637
2638 static void incrRefCount(robj *o) {
2639 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2640 o->refcount++;
2641 }
2642
2643 static void decrRefCount(void *obj) {
2644 robj *o = obj;
2645
2646 /* Object is a key of a swapped out value, or in the process of being
2647 * loaded. */
2648 if (server.vm_enabled &&
2649 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2650 {
2651 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2652 redisAssert(o->refcount == 1);
2653 }
2654 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2655 redisAssert(o->type == REDIS_STRING);
2656 freeStringObject(o);
2657 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2658 pthread_mutex_lock(&server.obj_freelist_mutex);
2659 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2660 !listAddNodeHead(server.objfreelist,o))
2661 zfree(o);
2662 pthread_mutex_unlock(&server.obj_freelist_mutex);
2663 server.vm_stats_swapped_objects--;
2664 return;
2665 }
2666 /* Object is in memory, or in the process of being swapped out. */
2667 if (--(o->refcount) == 0) {
2668 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2669 vmCancelThreadedIOJob(obj);
2670 switch(o->type) {
2671 case REDIS_STRING: freeStringObject(o); break;
2672 case REDIS_LIST: freeListObject(o); break;
2673 case REDIS_SET: freeSetObject(o); break;
2674 case REDIS_ZSET: freeZsetObject(o); break;
2675 case REDIS_HASH: freeHashObject(o); break;
2676 default: redisAssert(0 != 0); break;
2677 }
2678 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2679 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2680 !listAddNodeHead(server.objfreelist,o))
2681 zfree(o);
2682 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2683 }
2684 }
2685
2686 static robj *lookupKey(redisDb *db, robj *key) {
2687 dictEntry *de = dictFind(db->dict,key);
2688 if (de) {
2689 robj *key = dictGetEntryKey(de);
2690 robj *val = dictGetEntryVal(de);
2691
2692 if (server.vm_enabled) {
2693 if (key->storage == REDIS_VM_MEMORY ||
2694 key->storage == REDIS_VM_SWAPPING)
2695 {
2696 /* If we were swapping the object out, stop it, this key
2697 * was requested. */
2698 if (key->storage == REDIS_VM_SWAPPING)
2699 vmCancelThreadedIOJob(key);
2700 /* Update the access time of the key for the aging algorithm. */
2701 key->vm.atime = server.unixtime;
2702 } else {
2703 int notify = (key->storage == REDIS_VM_LOADING);
2704
2705 /* Our value was swapped on disk. Bring it at home. */
2706 redisAssert(val == NULL);
2707 val = vmLoadObject(key);
2708 dictGetEntryVal(de) = val;
2709
2710 /* Clients blocked by the VM subsystem may be waiting for
2711 * this key... */
2712 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2713 }
2714 }
2715 return val;
2716 } else {
2717 return NULL;
2718 }
2719 }
2720
2721 static robj *lookupKeyRead(redisDb *db, robj *key) {
2722 expireIfNeeded(db,key);
2723 return lookupKey(db,key);
2724 }
2725
2726 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2727 deleteIfVolatile(db,key);
2728 return lookupKey(db,key);
2729 }
2730
2731 static int deleteKey(redisDb *db, robj *key) {
2732 int retval;
2733
2734 /* We need to protect key from destruction: after the first dictDelete()
2735 * it may happen that 'key' is no longer valid if we don't increment
2736 * it's count. This may happen when we get the object reference directly
2737 * from the hash table with dictRandomKey() or dict iterators */
2738 incrRefCount(key);
2739 if (dictSize(db->expires)) dictDelete(db->expires,key);
2740 retval = dictDelete(db->dict,key);
2741 decrRefCount(key);
2742
2743 return retval == DICT_OK;
2744 }
2745
2746 /* Try to share an object against the shared objects pool */
2747 static robj *tryObjectSharing(robj *o) {
2748 struct dictEntry *de;
2749 unsigned long c;
2750
2751 if (o == NULL || server.shareobjects == 0) return o;
2752
2753 redisAssert(o->type == REDIS_STRING);
2754 de = dictFind(server.sharingpool,o);
2755 if (de) {
2756 robj *shared = dictGetEntryKey(de);
2757
2758 c = ((unsigned long) dictGetEntryVal(de))+1;
2759 dictGetEntryVal(de) = (void*) c;
2760 incrRefCount(shared);
2761 decrRefCount(o);
2762 return shared;
2763 } else {
2764 /* Here we are using a stream algorihtm: Every time an object is
2765 * shared we increment its count, everytime there is a miss we
2766 * recrement the counter of a random object. If this object reaches
2767 * zero we remove the object and put the current object instead. */
2768 if (dictSize(server.sharingpool) >=
2769 server.sharingpoolsize) {
2770 de = dictGetRandomKey(server.sharingpool);
2771 redisAssert(de != NULL);
2772 c = ((unsigned long) dictGetEntryVal(de))-1;
2773 dictGetEntryVal(de) = (void*) c;
2774 if (c == 0) {
2775 dictDelete(server.sharingpool,de->key);
2776 }
2777 } else {
2778 c = 0; /* If the pool is empty we want to add this object */
2779 }
2780 if (c == 0) {
2781 int retval;
2782
2783 retval = dictAdd(server.sharingpool,o,(void*)1);
2784 redisAssert(retval == DICT_OK);
2785 incrRefCount(o);
2786 }
2787 return o;
2788 }
2789 }
2790
2791 /* Check if the nul-terminated string 's' can be represented by a long
2792 * (that is, is a number that fits into long without any other space or
2793 * character before or after the digits).
2794 *
2795 * If so, the function returns REDIS_OK and *longval is set to the value
2796 * of the number. Otherwise REDIS_ERR is returned */
2797 static int isStringRepresentableAsLong(sds s, long *longval) {
2798 char buf[32], *endptr;
2799 long value;
2800 int slen;
2801
2802 value = strtol(s, &endptr, 10);
2803 if (endptr[0] != '\0') return REDIS_ERR;
2804 slen = snprintf(buf,32,"%ld",value);
2805
2806 /* If the number converted back into a string is not identical
2807 * then it's not possible to encode the string as integer */
2808 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2809 if (longval) *longval = value;
2810 return REDIS_OK;
2811 }
2812
2813 /* Try to encode a string object in order to save space */
2814 static int tryObjectEncoding(robj *o) {
2815 long value;
2816 sds s = o->ptr;
2817
2818 if (o->encoding != REDIS_ENCODING_RAW)
2819 return REDIS_ERR; /* Already encoded */
2820
2821 /* It's not save to encode shared objects: shared objects can be shared
2822 * everywhere in the "object space" of Redis. Encoded objects can only
2823 * appear as "values" (and not, for instance, as keys) */
2824 if (o->refcount > 1) return REDIS_ERR;
2825
2826 /* Currently we try to encode only strings */
2827 redisAssert(o->type == REDIS_STRING);
2828
2829 /* Check if we can represent this string as a long integer */
2830 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2831
2832 /* Ok, this object can be encoded */
2833 o->encoding = REDIS_ENCODING_INT;
2834 sdsfree(o->ptr);
2835 o->ptr = (void*) value;
2836 return REDIS_OK;
2837 }
2838
2839 /* Get a decoded version of an encoded object (returned as a new object).
2840 * If the object is already raw-encoded just increment the ref count. */
2841 static robj *getDecodedObject(robj *o) {
2842 robj *dec;
2843
2844 if (o->encoding == REDIS_ENCODING_RAW) {
2845 incrRefCount(o);
2846 return o;
2847 }
2848 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2849 char buf[32];
2850
2851 snprintf(buf,32,"%ld",(long)o->ptr);
2852 dec = createStringObject(buf,strlen(buf));
2853 return dec;
2854 } else {
2855 redisAssert(1 != 1);
2856 }
2857 }
2858
2859 /* Compare two string objects via strcmp() or alike.
2860 * Note that the objects may be integer-encoded. In such a case we
2861 * use snprintf() to get a string representation of the numbers on the stack
2862 * and compare the strings, it's much faster than calling getDecodedObject().
2863 *
2864 * Important note: if objects are not integer encoded, but binary-safe strings,
2865 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2866 * binary safe. */
2867 static int compareStringObjects(robj *a, robj *b) {
2868 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2869 char bufa[128], bufb[128], *astr, *bstr;
2870 int bothsds = 1;
2871
2872 if (a == b) return 0;
2873 if (a->encoding != REDIS_ENCODING_RAW) {
2874 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2875 astr = bufa;
2876 bothsds = 0;
2877 } else {
2878 astr = a->ptr;
2879 }
2880 if (b->encoding != REDIS_ENCODING_RAW) {
2881 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2882 bstr = bufb;
2883 bothsds = 0;
2884 } else {
2885 bstr = b->ptr;
2886 }
2887 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2888 }
2889
2890 static size_t stringObjectLen(robj *o) {
2891 redisAssert(o->type == REDIS_STRING);
2892 if (o->encoding == REDIS_ENCODING_RAW) {
2893 return sdslen(o->ptr);
2894 } else {
2895 char buf[32];
2896
2897 return snprintf(buf,32,"%ld",(long)o->ptr);
2898 }
2899 }
2900
2901 /*============================ RDB saving/loading =========================== */
2902
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2907
/* Write 't' to 'fp' as a 32 bit signed integer (4 raw bytes, as laid out
 * in memory). Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
2913
2914 /* check rdbLoadLen() comments for more info */
2915 static int rdbSaveLen(FILE *fp, uint32_t len) {
2916 unsigned char buf[2];
2917
2918 if (len < (1<<6)) {
2919 /* Save a 6 bit len */
2920 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2921 if (fwrite(buf,1,1,fp) == 0) return -1;
2922 } else if (len < (1<<14)) {
2923 /* Save a 14 bit len */
2924 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2925 buf[1] = len&0xFF;
2926 if (fwrite(buf,2,1,fp) == 0) return -1;
2927 } else {
2928 /* Save a 32 bit len */
2929 buf[0] = (REDIS_RDB_32BITLEN<<6);
2930 if (fwrite(buf,1,1,fp) == 0) return -1;
2931 len = htonl(len);
2932 if (fwrite(&len,4,1,fp) == 0) return -1;
2933 }
2934 return 0;
2935 }
2936
2937 /* String objects in the form "2391" "-100" without any space and with a
2938 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2939 * encoded as integers to save space */
2940 static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
2941 long long value;
2942 char *endptr, buf[32];
2943
2944 /* Check if it's possible to encode this value as a number */
2945 value = strtoll(s, &endptr, 10);
2946 if (endptr[0] != '\0') return 0;
2947 snprintf(buf,32,"%lld",value);
2948
2949 /* If the number converted back into a string is not identical
2950 * then it's not possible to encode the string as integer */
2951 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2952
2953 /* Finally check if it fits in our ranges */
2954 if (value >= -(1<<7) && value <= (1<<7)-1) {
2955 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2956 enc[1] = value&0xFF;
2957 return 2;
2958 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2959 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2960 enc[1] = value&0xFF;
2961 enc[2] = (value>>8)&0xFF;
2962 return 3;
2963 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2964 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2965 enc[1] = value&0xFF;
2966 enc[2] = (value>>8)&0xFF;
2967 enc[3] = (value>>16)&0xFF;
2968 enc[4] = (value>>24)&0xFF;
2969 return 5;
2970 } else {
2971 return 0;
2972 }
2973 }
2974
2975 static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2976 unsigned int comprlen, outlen;
2977 unsigned char byte;
2978 void *out;
2979
2980 /* We require at least four bytes compression for this to be worth it */
2981 outlen = sdslen(obj->ptr)-4;
2982 if (outlen <= 0) return 0;
2983 if ((out = zmalloc(outlen+1)) == NULL) return 0;
2984 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2985 if (comprlen == 0) {
2986 zfree(out);
2987 return 0;
2988 }
2989 /* Data compressed! Let's save it on disk */
2990 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2991 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2992 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2993 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2994 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
2995 zfree(out);
2996 return comprlen;
2997
2998 writeerr:
2999 zfree(out);
3000 return -1;
3001 }
3002
3003 /* Save a string objet as [len][data] on disk. If the object is a string
3004 * representation of an integer value we try to safe it in a special form */
3005 static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
3006 size_t len;
3007 int enclen;
3008
3009 len = sdslen(obj->ptr);
3010
3011 /* Try integer encoding */
3012 if (len <= 11) {
3013 unsigned char buf[5];
3014 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
3015 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3016 return 0;
3017 }
3018 }
3019
3020 /* Try LZF compression - under 20 bytes it's unable to compress even
3021 * aaaaaaaaaaaaaaaaaa so skip it */
3022 if (server.rdbcompression && len > 20) {
3023 int retval;
3024
3025 retval = rdbSaveLzfStringObject(fp,obj);
3026 if (retval == -1) return -1;
3027 if (retval > 0) return 0;
3028 /* retval == 0 means data can't be compressed, save the old way */
3029 }
3030
3031 /* Store verbatim */
3032 if (rdbSaveLen(fp,len) == -1) return -1;
3033 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
3034 return 0;
3035 }
3036
3037 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3038 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3039 int retval;
3040
3041 /* Avoid incr/decr ref count business when possible.
3042 * This plays well with copy-on-write given that we are probably
3043 * in a child process (BGSAVE). Also this makes sure key objects
3044 * of swapped objects are not incRefCount-ed (an assert does not allow
3045 * this in order to avoid bugs) */
3046 if (obj->encoding != REDIS_ENCODING_RAW) {
3047 obj = getDecodedObject(obj);
3048 retval = rdbSaveStringObjectRaw(fp,obj);
3049 decrRefCount(obj);
3050 } else {
3051 retval = rdbSaveStringObjectRaw(fp,obj);
3052 }
3053 return retval;
3054 }
3055
/* Save a double value. Doubles are saved as strings (printed with "%.17g")
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This prefix byte has special values, written alone, to
 * represent the following conditions:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *digits = (char*)buf+1;

        snprintf(digits,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(digits);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3082
3083 /* Save a Redis object. */
3084 static int rdbSaveObject(FILE *fp, robj *o) {
3085 if (o->type == REDIS_STRING) {
3086 /* Save a string value */
3087 if (rdbSaveStringObject(fp,o) == -1) return -1;
3088 } else if (o->type == REDIS_LIST) {
3089 /* Save a list value */
3090 list *list = o->ptr;
3091 listIter li;
3092 listNode *ln;
3093
3094 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3095 listRewind(list,&li);
3096 while((ln = listNext(&li))) {
3097 robj *eleobj = listNodeValue(ln);
3098
3099 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3100 }
3101 } else if (o->type == REDIS_SET) {
3102 /* Save a set value */
3103 dict *set = o->ptr;
3104 dictIterator *di = dictGetIterator(set);
3105 dictEntry *de;
3106
3107 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3108 while((de = dictNext(di)) != NULL) {
3109 robj *eleobj = dictGetEntryKey(de);
3110
3111 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3112 }
3113 dictReleaseIterator(di);
3114 } else if (o->type == REDIS_ZSET) {
3115 /* Save a set value */
3116 zset *zs = o->ptr;
3117 dictIterator *di = dictGetIterator(zs->dict);
3118 dictEntry *de;
3119
3120 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3121 while((de = dictNext(di)) != NULL) {
3122 robj *eleobj = dictGetEntryKey(de);
3123 double *score = dictGetEntryVal(de);
3124
3125 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3126 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3127 }
3128 dictReleaseIterator(di);
3129 } else {
3130 redisAssert(0 != 0);
3131 }
3132 return 0;
3133 }
3134
3135 /* Return the length the object will have on disk if saved with
3136 * the rdbSaveObject() function. Currently we use a trick to get
3137 * this length with very little changes to the code. In the future
3138 * we could switch to a faster solution. */
3139 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3140 if (fp == NULL) fp = server.devnull;
3141 rewind(fp);
3142 assert(rdbSaveObject(fp,o) != 1);
3143 return ftello(fp);
3144 }
3145
3146 /* Return the number of pages required to save this object in the swap file */
3147 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3148 off_t bytes = rdbSavedObjectLen(o,fp);
3149
3150 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3151 }
3152
3153 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3154 static int rdbSave(char *filename) {
3155 dictIterator *di = NULL;
3156 dictEntry *de;
3157 FILE *fp;
3158 char tmpfile[256];
3159 int j;
3160 time_t now = time(NULL);
3161
3162 /* Wait for I/O therads to terminate, just in case this is a
3163 * foreground-saving, to avoid seeking the swap file descriptor at the
3164 * same time. */
3165 if (server.vm_enabled)
3166 waitEmptyIOJobsQueue();
3167
3168 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3169 fp = fopen(tmpfile,"w");
3170 if (!fp) {
3171 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3172 return REDIS_ERR;
3173 }
3174 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3175 for (j = 0; j < server.dbnum; j++) {
3176 redisDb *db = server.db+j;
3177 dict *d = db->dict;
3178 if (dictSize(d) == 0) continue;
3179 di = dictGetIterator(d);
3180 if (!di) {
3181 fclose(fp);
3182 return REDIS_ERR;
3183 }
3184
3185 /* Write the SELECT DB opcode */
3186 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3187 if (rdbSaveLen(fp,j) == -1) goto werr;
3188
3189 /* Iterate this DB writing every entry */
3190 while((de = dictNext(di)) != NULL) {
3191 robj *key = dictGetEntryKey(de);
3192 robj *o = dictGetEntryVal(de);
3193 time_t expiretime = getExpire(db,key);
3194
3195 /* Save the expire time */
3196 if (expiretime != -1) {
3197 /* If this key is already expired skip it */
3198 if (expiretime < now) continue;
3199 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3200 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3201 }
3202 /* Save the key and associated value. This requires special
3203 * handling if the value is swapped out. */
3204 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3205 key->storage == REDIS_VM_SWAPPING) {
3206 /* Save type, key, value */
3207 if (rdbSaveType(fp,o->type) == -1) goto werr;
3208 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3209 if (rdbSaveObject(fp,o) == -1) goto werr;
3210 } else {
3211 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3212 robj *po;
3213 /* Get a preview of the object in memory */
3214 po = vmPreviewObject(key);
3215 /* Save type, key, value */
3216 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3217 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3218 if (rdbSaveObject(fp,po) == -1) goto werr;
3219 /* Remove the loaded object from memory */
3220 decrRefCount(po);
3221 }
3222 }
3223 dictReleaseIterator(di);
3224 }
3225 /* EOF opcode */
3226 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3227
3228 /* Make sure data will not remain on the OS's output buffers */
3229 fflush(fp);
3230 fsync(fileno(fp));
3231 fclose(fp);
3232
3233 /* Use RENAME to make sure the DB file is changed atomically only
3234 * if the generate DB file is ok. */
3235 if (rename(tmpfile,filename) == -1) {
3236 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3237 unlink(tmpfile);
3238 return REDIS_ERR;
3239 }
3240 redisLog(REDIS_NOTICE,"DB saved on disk");
3241 server.dirty = 0;
3242 server.lastsave = time(NULL);
3243 return REDIS_OK;
3244
3245 werr:
3246 fclose(fp);
3247 unlink(tmpfile);
3248 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3249 if (di) dictReleaseIterator(di);
3250 return REDIS_ERR;
3251 }
3252
3253 static int rdbSaveBackground(char *filename) {
3254 pid_t childpid;
3255
3256 if (server.bgsavechildpid != -1) return REDIS_ERR;
3257 if (server.vm_enabled) waitEmptyIOJobsQueue();
3258 if ((childpid = fork()) == 0) {
3259 /* Child */
3260 if (server.vm_enabled) vmReopenSwapFile();
3261 close(server.fd);
3262 if (rdbSave(filename) == REDIS_OK) {
3263 _exit(0);
3264 } else {
3265 _exit(1);
3266 }
3267 } else {
3268 /* Parent */
3269 if (childpid == -1) {
3270 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3271 strerror(errno));
3272 return REDIS_ERR;
3273 }
3274 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3275 server.bgsavechildpid = childpid;
3276 return REDIS_OK;
3277 }
3278 return REDIS_OK; /* unreached */
3279 }
3280
/* Remove the temporary file "temp-<childpid>.rdb" from the current
 * directory. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3287
/* Read a single type byte from 'fp'. Returns the byte value (0-255), or -1
 * if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3293
/* Read a time saved by rdbSaveTime(): 4 raw bytes holding a 32 bit signed
 * integer. Returns the value as time_t, or -1 if the read failed. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,4,1,fp) != 0) return (time_t) seconds;
    return -1;
}
3299
3300 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3301 * of this file for a description of how this are stored on disk.
3302 *
3303 * isencoded is set to 1 if the readed length is not actually a length but
3304 * an "encoding type", check the above comments for more info */
3305 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3306 unsigned char buf[2];
3307 uint32_t len;
3308 int type;
3309
3310 if (isencoded) *isencoded = 0;
3311 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3312 type = (buf[0]&0xC0)>>6;
3313 if (type == REDIS_RDB_6BITLEN) {
3314 /* Read a 6 bit len */
3315 return buf[0]&0x3F;
3316 } else if (type == REDIS_RDB_ENCVAL) {
3317 /* Read a 6 bit len encoding type */
3318 if (isencoded) *isencoded = 1;
3319 return buf[0]&0x3F;
3320 } else if (type == REDIS_RDB_14BITLEN) {
3321 /* Read a 14 bit len */
3322 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3323 return ((buf[0]&0x3F)<<8)|buf[1];
3324 } else {
3325 /* Read a 32 bit len */
3326 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3327 return ntohl(len);
3328 }
3329 }
3330
3331 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3332 unsigned char enc[4];
3333 long long val;
3334
3335 if (enctype == REDIS_RDB_ENC_INT8) {
3336 if (fread(enc,1,1,fp) == 0) return NULL;
3337 val = (signed char)enc[0];
3338 } else if (enctype == REDIS_RDB_ENC_INT16) {
3339 uint16_t v;
3340 if (fread(enc,2,1,fp) == 0) return NULL;
3341 v = enc[0]|(enc[1]<<8);
3342 val = (int16_t)v;
3343 } else if (enctype == REDIS_RDB_ENC_INT32) {
3344 uint32_t v;
3345 if (fread(enc,4,1,fp) == 0) return NULL;
3346 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3347 val = (int32_t)v;
3348 } else {
3349 val = 0; /* anti-warning */
3350 redisAssert(0!=0);
3351 }
3352 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3353 }
3354
3355 static robj *rdbLoadLzfStringObject(FILE*fp) {
3356 unsigned int len, clen;
3357 unsigned char *c = NULL;
3358 sds val = NULL;
3359
3360 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3361 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3362 if ((c = zmalloc(clen)) == NULL) goto err;
3363 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3364 if (fread(c,clen,1,fp) == 0) goto err;
3365 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3366 zfree(c);
3367 return createObject(REDIS_STRING,val);
3368 err:
3369 zfree(c);
3370 sdsfree(val);
3371 return NULL;
3372 }
3373
3374 static robj *rdbLoadStringObject(FILE*fp) {
3375 int isencoded;
3376 uint32_t len;
3377 sds val;
3378
3379 len = rdbLoadLen(fp,&isencoded);
3380 if (isencoded) {
3381 switch(len) {
3382 case REDIS_RDB_ENC_INT8:
3383 case REDIS_RDB_ENC_INT16:
3384 case REDIS_RDB_ENC_INT32:
3385 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3386 case REDIS_RDB_ENC_LZF:
3387 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3388 default:
3389 redisAssert(0!=0);
3390 }
3391 }
3392
3393 if (len == REDIS_RDB_LENERR) return NULL;
3394 val = sdsnewlen(NULL,len);
3395 if (len && fread(val,len,1,fp) == 0) {
3396 sdsfree(val);
3397 return NULL;
3398 }
3399 return tryObjectSharing(createObject(REDIS_STRING,val));
3400 }
3401
3402 /* For information about double serialization check rdbSaveDoubleValue() */
3403 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3404 char buf[128];
3405 unsigned char len;
3406
3407 if (fread(&len,1,1,fp) == 0) return -1;
3408 switch(len) {
3409 case 255: *val = R_NegInf; return 0;
3410 case 254: *val = R_PosInf; return 0;
3411 case 253: *val = R_Nan; return 0;
3412 default:
3413 if (fread(buf,len,1,fp) == 0) return -1;
3414 buf[len] = '\0';
3415 sscanf(buf, "%lg", val);
3416 return 0;
3417 }
3418 }
3419
3420 /* Load a Redis object of the specified type from the specified file.
3421 * On success a newly allocated object is returned, otherwise NULL. */
3422 static robj *rdbLoadObject(int type, FILE *fp) {
3423 robj *o;
3424
3425 if (type == REDIS_STRING) {
3426 /* Read string value */
3427 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3428 tryObjectEncoding(o);
3429 } else if (type == REDIS_LIST || type == REDIS_SET) {
3430 /* Read list/set value */
3431 uint32_t listlen;
3432
3433 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3434 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3435 /* It's faster to expand the dict to the right size asap in order
3436 * to avoid rehashing */
3437 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3438 dictExpand(o->ptr,listlen);
3439 /* Load every single element of the list/set */
3440 while(listlen--) {
3441 robj *ele;
3442
3443 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3444 tryObjectEncoding(ele);
3445 if (type == REDIS_LIST) {
3446 listAddNodeTail((list*)o->ptr,ele);
3447 } else {
3448 dictAdd((dict*)o->ptr,ele,NULL);
3449 }
3450 }
3451 } else if (type == REDIS_ZSET) {
3452 /* Read list/set value */
3453 uint32_t zsetlen;
3454 zset *zs;
3455
3456 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3457 o = createZsetObject();
3458 zs = o->ptr;
3459 /* Load every single element of the list/set */
3460 while(zsetlen--) {
3461 robj *ele;
3462 double *score = zmalloc(sizeof(double));
3463
3464 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3465 tryObjectEncoding(ele);
3466 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3467 dictAdd(zs->dict,ele,score);
3468 zslInsert(zs->zsl,*score,ele);
3469 incrRefCount(ele); /* added to skiplist */
3470 }
3471 } else {
3472 redisAssert(0 != 0);
3473 }
3474 return o;
3475 }
3476
3477 static int rdbLoad(char *filename) {
3478 FILE *fp;
3479 robj *keyobj = NULL;
3480 uint32_t dbid;
3481 int type, retval, rdbver;
3482 dict *d = server.db[0].dict;
3483 redisDb *db = server.db+0;
3484 char buf[1024];
3485 time_t expiretime = -1, now = time(NULL);
3486 long long loadedkeys = 0;
3487
3488 fp = fopen(filename,"r");
3489 if (!fp) return REDIS_ERR;
3490 if (fread(buf,9,1,fp) == 0) goto eoferr;
3491 buf[9] = '\0';
3492 if (memcmp(buf,"REDIS",5) != 0) {
3493 fclose(fp);
3494 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3495 return REDIS_ERR;
3496 }
3497 rdbver = atoi(buf+5);
3498 if (rdbver != 1) {
3499 fclose(fp);
3500 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3501 return REDIS_ERR;
3502 }
3503 while(1) {
3504 robj *o;
3505
3506 /* Read type. */
3507 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3508 if (type == REDIS_EXPIRETIME) {
3509 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3510 /* We read the time so we need to read the object type again */
3511 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3512 }
3513 if (type == REDIS_EOF) break;
3514 /* Handle SELECT DB opcode as a special case */
3515 if (type == REDIS_SELECTDB) {
3516 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3517 goto eoferr;
3518 if (dbid >= (unsigned)server.dbnum) {
3519 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3520 exit(1);
3521 }
3522 db = server.db+dbid;
3523 d = db->dict;
3524 continue;
3525 }
3526 /* Read key */
3527 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3528 /* Read value */
3529 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3530 /* Add the new object in the hash table */
3531 retval = dictAdd(d,keyobj,o);
3532 if (retval == DICT_ERR) {
3533 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3534 exit(1);
3535 }
3536 /* Set the expire time if needed */
3537 if (expiretime != -1) {
3538 setExpire(db,keyobj,expiretime);
3539 /* Delete this key if already expired */
3540 if (expiretime < now) deleteKey(db,keyobj);
3541 expiretime = -1;
3542 }
3543 keyobj = o = NULL;
3544 /* Handle swapping while loading big datasets when VM is on */
3545 loadedkeys++;
3546 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3547 while (zmalloc_used_memory() > server.vm_max_memory) {
3548 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3549 }
3550 }
3551 }
3552 fclose(fp);
3553 return REDIS_OK;
3554
3555 eoferr: /* unexpected end of file is handled here with a fatal exit */
3556 if (keyobj) decrRefCount(keyobj);
3557 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3558 exit(1);
3559 return REDIS_ERR; /* Just to avoid warning */
3560 }
3561
3562 /*================================== Commands =============================== */
3563
3564 static void authCommand(redisClient *c) {
3565 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3566 c->authenticated = 1;
3567 addReply(c,shared.ok);
3568 } else {
3569 c->authenticated = 0;
3570 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3571 }
3572 }
3573
3574 static void pingCommand(redisClient *c) {
3575 addReply(c,shared.pong);
3576 }
3577
3578 static void echoCommand(redisClient *c) {
3579 addReplyBulkLen(c,c->argv[1]);
3580 addReply(c,c->argv[1]);
3581 addReply(c,shared.crlf);
3582 }
3583
3584 /*=================================== Strings =============================== */
3585
3586 static void setGenericCommand(redisClient *c, int nx) {
3587 int retval;
3588
3589 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3590 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3591 if (retval == DICT_ERR) {
3592 if (!nx) {
3593 /* If the key is about a swapped value, we want a new key object
3594 * to overwrite the old. So we delete the old key in the database.
3595 * This will also make sure that swap pages about the old object
3596 * will be marked as free. */
3597 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3598 incrRefCount(c->argv[1]);
3599 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3600 incrRefCount(c->argv[2]);
3601 } else {
3602 addReply(c,shared.czero);
3603 return;
3604 }
3605 } else {
3606 incrRefCount(c->argv[1]);
3607 incrRefCount(c->argv[2]);
3608 }
3609 server.dirty++;
3610 removeExpire(c->db,c->argv[1]);
3611 addReply(c, nx ? shared.cone : shared.ok);
3612 }
3613
3614 static void setCommand(redisClient *c) {
3615 setGenericCommand(c,0);
3616 }
3617
3618 static void setnxCommand(redisClient *c) {
3619 setGenericCommand(c,1);
3620 }
3621
3622 static int getGenericCommand(redisClient *c) {
3623 robj *o = lookupKeyRead(c->db,c->argv[1]);
3624
3625 if (o == NULL) {
3626 addReply(c,shared.nullbulk);
3627 return REDIS_OK;
3628 } else {
3629 if (o->type != REDIS_STRING) {
3630 addReply(c,shared.wrongtypeerr);
3631 return REDIS_ERR;
3632 } else {
3633 addReplyBulkLen(c,o);
3634 addReply(c,o);
3635 addReply(c,shared.crlf);
3636 return REDIS_OK;
3637 }
3638 }
3639 }
3640
3641 static void getCommand(redisClient *c) {
3642 getGenericCommand(c);
3643 }
3644
3645 static void getsetCommand(redisClient *c) {
3646 if (getGenericCommand(c) == REDIS_ERR) return;
3647 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3648 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3649 } else {
3650 incrRefCount(c->argv[1]);
3651 }
3652 incrRefCount(c->argv[2]);
3653 server.dirty++;
3654 removeExpire(c->db,c->argv[1]);
3655 }
3656
3657 static void mgetCommand(redisClient *c) {
3658 int j;
3659
3660 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3661 for (j = 1; j < c->argc; j++) {
3662 robj *o = lookupKeyRead(c->db,c->argv[j]);
3663 if (o == NULL) {
3664 addReply(c,shared.nullbulk);
3665 } else {
3666 if (o->type != REDIS_STRING) {
3667 addReply(c,shared.nullbulk);
3668 } else {
3669 addReplyBulkLen(c,o);
3670 addReply(c,o);
3671 addReply(c,shared.crlf);
3672 }
3673 }
3674 }
3675 }
3676
3677 static void msetGenericCommand(redisClient *c, int nx) {
3678 int j, busykeys = 0;
3679
3680 if ((c->argc % 2) == 0) {
3681 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3682 return;
3683 }
3684 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3685 * set nothing at all if at least one already key exists. */
3686 if (nx) {
3687 for (j = 1; j < c->argc; j += 2) {
3688 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3689 busykeys++;
3690 }
3691 }
3692 }
3693 if (busykeys) {
3694 addReply(c, shared.czero);
3695 return;
3696 }
3697
3698 for (j = 1; j < c->argc; j += 2) {
3699 int retval;
3700
3701 tryObjectEncoding(c->argv[j+1]);
3702 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3703 if (retval == DICT_ERR) {
3704 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3705 incrRefCount(c->argv[j+1]);
3706 } else {
3707 incrRefCount(c->argv[j]);
3708 incrRefCount(c->argv[j+1]);
3709 }
3710 removeExpire(c->db,c->argv[j]);
3711 }
3712 server.dirty += (c->argc-1)/2;
3713 addReply(c, nx ? shared.cone : shared.ok);
3714 }
3715
3716 static void msetCommand(redisClient *c) {
3717 msetGenericCommand(c,0);
3718 }
3719
3720 static void msetnxCommand(redisClient *c) {
3721 msetGenericCommand(c,1);
3722 }
3723
3724 static void incrDecrCommand(redisClient *c, long long incr) {
3725 long long value;
3726 int retval;
3727 robj *o;
3728
3729 o = lookupKeyWrite(c->db,c->argv[1]);
3730 if (o == NULL) {
3731 value = 0;
3732 } else {
3733 if (o->type != REDIS_STRING) {
3734 value = 0;
3735 } else {
3736 char *eptr;
3737
3738 if (o->encoding == REDIS_ENCODING_RAW)
3739 value = strtoll(o->ptr, &eptr, 10);
3740 else if (o->encoding == REDIS_ENCODING_INT)
3741 value = (long)o->ptr;
3742 else
3743 redisAssert(1 != 1);
3744 }
3745 }
3746
3747 value += incr;
3748 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3749 tryObjectEncoding(o);
3750 retval = dictAdd(c->db->dict,c->argv[1],o);
3751 if (retval == DICT_ERR) {
3752 dictReplace(c->db->dict,c->argv[1],o);
3753 removeExpire(c->db,c->argv[1]);
3754 } else {
3755 incrRefCount(c->argv[1]);
3756 }
3757 server.dirty++;
3758 addReply(c,shared.colon);
3759 addReply(c,o);
3760 addReply(c,shared.crlf);
3761 }
3762
3763 static void incrCommand(redisClient *c) {
3764 incrDecrCommand(c,1);
3765 }
3766
3767 static void decrCommand(redisClient *c) {
3768 incrDecrCommand(c,-1);
3769 }
3770
3771 static void incrbyCommand(redisClient *c) {
3772 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3773 incrDecrCommand(c,incr);
3774 }
3775
3776 static void decrbyCommand(redisClient *c) {
3777 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3778 incrDecrCommand(c,-incr);
3779 }
3780
3781 static void appendCommand(redisClient *c) {
3782 int retval;
3783 size_t totlen;
3784 robj *o;
3785
3786 o = lookupKeyWrite(c->db,c->argv[1]);
3787 if (o == NULL) {
3788 /* Create the key */
3789 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3790 incrRefCount(c->argv[1]);
3791 incrRefCount(c->argv[2]);
3792 totlen = stringObjectLen(c->argv[2]);
3793 } else {
3794 dictEntry *de;
3795
3796 de = dictFind(c->db->dict,c->argv[1]);
3797 assert(de != NULL);
3798
3799 o = dictGetEntryVal(de);
3800 if (o->type != REDIS_STRING) {
3801 addReply(c,shared.wrongtypeerr);
3802 return;
3803 }
3804 /* If the object is specially encoded or shared we have to make
3805 * a copy */
3806 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3807 robj *decoded = getDecodedObject(o);
3808
3809 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3810 decrRefCount(decoded);
3811 dictReplace(c->db->dict,c->argv[1],o);
3812 }
3813 /* APPEND! */
3814 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3815 o->ptr = sdscatlen(o->ptr,
3816 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3817 } else {
3818 o->ptr = sdscatprintf(o->ptr, "%ld",
3819 (unsigned long) c->argv[2]->ptr);
3820 }
3821 totlen = sdslen(o->ptr);
3822 }
3823 server.dirty++;
3824 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3825 }
3826
3827 static void substrCommand(redisClient *c) {
3828 robj *o;
3829 long start = atoi(c->argv[2]->ptr);
3830 long end = atoi(c->argv[3]->ptr);
3831
3832 o = lookupKeyRead(c->db,c->argv[1]);
3833 if (o == NULL) {
3834 addReply(c,shared.nullbulk);
3835 } else {
3836 if (o->type != REDIS_STRING) {
3837 addReply(c,shared.wrongtypeerr);
3838 } else {
3839 size_t rangelen, strlen;
3840 sds range;
3841
3842 o = getDecodedObject(o);
3843 strlen = sdslen(o->ptr);
3844
3845 /* convert negative indexes */
3846 if (start < 0) start = strlen+start;
3847 if (end < 0) end = strlen+end;
3848 if (start < 0) start = 0;
3849 if (end < 0) end = 0;
3850
3851 /* indexes sanity checks */
3852 if (start > end || (size_t)start >= strlen) {
3853 /* Out of range start or start > end result in null reply */
3854 addReply(c,shared.nullbulk);
3855 decrRefCount(o);
3856 return;
3857 }
3858 if ((size_t)end >= strlen) end = strlen-1;
3859 rangelen = (end-start)+1;
3860
3861 /* Return the result */
3862 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",rangelen));
3863 range = sdsnewlen((char*)o->ptr+start,rangelen);
3864 addReplySds(c,range);
3865 addReply(c,shared.crlf);
3866 decrRefCount(o);
3867 }
3868 }
3869 }
3870
3871 /* ========================= Type agnostic commands ========================= */
3872
3873 static void delCommand(redisClient *c) {
3874 int deleted = 0, j;
3875
3876 for (j = 1; j < c->argc; j++) {
3877 if (deleteKey(c->db,c->argv[j])) {
3878 server.dirty++;
3879 deleted++;
3880 }
3881 }
3882 switch(deleted) {
3883 case 0:
3884 addReply(c,shared.czero);
3885 break;
3886 case 1:
3887 addReply(c,shared.cone);
3888 break;
3889 default:
3890 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3891 break;
3892 }
3893 }
3894
3895 static void existsCommand(redisClient *c) {
3896 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
3897 }
3898
3899 static void selectCommand(redisClient *c) {
3900 int id = atoi(c->argv[1]->ptr);
3901
3902 if (selectDb(c,id) == REDIS_ERR) {
3903 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
3904 } else {
3905 addReply(c,shared.ok);
3906 }
3907 }
3908
3909 static void randomkeyCommand(redisClient *c) {
3910 dictEntry *de;
3911
3912 while(1) {
3913 de = dictGetRandomKey(c->db->dict);
3914 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3915 }
3916 if (de == NULL) {
3917 addReply(c,shared.plus);
3918 addReply(c,shared.crlf);
3919 } else {
3920 addReply(c,shared.plus);
3921 addReply(c,dictGetEntryKey(de));
3922 addReply(c,shared.crlf);
3923 }
3924 }
3925
3926 static void keysCommand(redisClient *c) {
3927 dictIterator *di;
3928 dictEntry *de;
3929 sds pattern = c->argv[1]->ptr;
3930 int plen = sdslen(pattern);
3931 unsigned long numkeys = 0;
3932 robj *lenobj = createObject(REDIS_STRING,NULL);
3933
3934 di = dictGetIterator(c->db->dict);
3935 addReply(c,lenobj);
3936 decrRefCount(lenobj);
3937 while((de = dictNext(di)) != NULL) {
3938 robj *keyobj = dictGetEntryKey(de);
3939
3940 sds key = keyobj->ptr;
3941 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3942 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3943 if (expireIfNeeded(c->db,keyobj) == 0) {
3944 addReplyBulkLen(c,keyobj);
3945 addReply(c,keyobj);
3946 addReply(c,shared.crlf);
3947 numkeys++;
3948 }
3949 }
3950 }
3951 dictReleaseIterator(di);
3952 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
3953 }
3954
3955 static void dbsizeCommand(redisClient *c) {
3956 addReplySds(c,
3957 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
3958 }
3959
3960 static void lastsaveCommand(redisClient *c) {
3961 addReplySds(c,
3962 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
3963 }
3964
3965 static void typeCommand(redisClient *c) {
3966 robj *o;
3967 char *type;
3968
3969 o = lookupKeyRead(c->db,c->argv[1]);
3970 if (o == NULL) {
3971 type = "+none";
3972 } else {
3973 switch(o->type) {
3974 case REDIS_STRING: type = "+string"; break;
3975 case REDIS_LIST: type = "+list"; break;
3976 case REDIS_SET: type = "+set"; break;
3977 case REDIS_ZSET: type = "+zset"; break;
3978 default: type = "unknown"; break;
3979 }
3980 }
3981 addReplySds(c,sdsnew(type));
3982 addReply(c,shared.crlf);
3983 }
3984
3985 static void saveCommand(redisClient *c) {
3986 if (server.bgsavechildpid != -1) {
3987 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3988 return;
3989 }
3990 if (rdbSave(server.dbfilename) == REDIS_OK) {
3991 addReply(c,shared.ok);
3992 } else {
3993 addReply(c,shared.err);
3994 }
3995 }
3996
3997 static void bgsaveCommand(redisClient *c) {
3998 if (server.bgsavechildpid != -1) {
3999 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4000 return;
4001 }
4002 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4003 char *status = "+Background saving started\r\n";
4004 addReplySds(c,sdsnew(status));
4005 } else {
4006 addReply(c,shared.err);
4007 }
4008 }
4009
4010 static void shutdownCommand(redisClient *c) {
4011 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4012 /* Kill the saving child if there is a background saving in progress.
4013 We want to avoid race conditions, for instance our saving child may
4014 overwrite the synchronous saving did by SHUTDOWN. */
4015 if (server.bgsavechildpid != -1) {
4016 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4017 kill(server.bgsavechildpid,SIGKILL);
4018 rdbRemoveTempFile(server.bgsavechildpid);
4019 }
4020 if (server.appendonly) {
4021 /* Append only file: fsync() the AOF and exit */
4022 fsync(server.appendfd);
4023 if (server.vm_enabled) unlink(server.vm_swap_file);
4024 exit(0);
4025 } else {
4026 /* Snapshotting. Perform a SYNC SAVE and exit */
4027 if (rdbSave(server.dbfilename) == REDIS_OK) {
4028 if (server.daemonize)
4029 unlink(server.pidfile);
4030 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4031 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4032 if (server.vm_enabled) unlink(server.vm_swap_file);
4033 exit(0);
4034 } else {
4035 /* Ooops.. error saving! The best we can do is to continue operating.
4036 * Note that if there was a background saving process, in the next
4037 * cron() Redis will be notified that the background saving aborted,
4038 * handling special stuff like slaves pending for synchronization... */
4039 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4040 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4041 }
4042 }
4043 }
4044
4045 static void renameGenericCommand(redisClient *c, int nx) {
4046 robj *o;
4047
4048 /* To use the same key as src and dst is probably an error */
4049 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4050 addReply(c,shared.sameobjecterr);
4051 return;
4052 }
4053
4054 o = lookupKeyWrite(c->db,c->argv[1]);
4055 if (o == NULL) {
4056 addReply(c,shared.nokeyerr);
4057 return;
4058 }
4059 incrRefCount(o);
4060 deleteIfVolatile(c->db,c->argv[2]);
4061 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4062 if (nx) {
4063 decrRefCount(o);
4064 addReply(c,shared.czero);
4065 return;
4066 }
4067 dictReplace(c->db->dict,c->argv[2],o);
4068 } else {
4069 incrRefCount(c->argv[2]);
4070 }
4071 deleteKey(c->db,c->argv[1]);
4072 server.dirty++;
4073 addReply(c,nx ? shared.cone : shared.ok);
4074 }
4075
4076 static void renameCommand(redisClient *c) {
4077 renameGenericCommand(c,0);
4078 }
4079
4080 static void renamenxCommand(redisClient *c) {
4081 renameGenericCommand(c,1);
4082 }
4083
4084 static void moveCommand(redisClient *c) {
4085 robj *o;
4086 redisDb *src, *dst;
4087 int srcid;
4088
4089 /* Obtain source and target DB pointers */
4090 src = c->db;
4091 srcid = c->db->id;
4092 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4093 addReply(c,shared.outofrangeerr);
4094 return;
4095 }
4096 dst = c->db;
4097 selectDb(c,srcid); /* Back to the source DB */
4098
4099 /* If the user is moving using as target the same
4100 * DB as the source DB it is probably an error. */
4101 if (src == dst) {
4102 addReply(c,shared.sameobjecterr);
4103 return;
4104 }
4105
4106 /* Check if the element exists and get a reference */
4107 o = lookupKeyWrite(c->db,c->argv[1]);
4108 if (!o) {
4109 addReply(c,shared.czero);
4110 return;
4111 }
4112
4113 /* Try to add the element to the target DB */
4114 deleteIfVolatile(dst,c->argv[1]);
4115 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4116 addReply(c,shared.czero);
4117 return;
4118 }
4119 incrRefCount(c->argv[1]);
4120 incrRefCount(o);
4121
4122 /* OK! key moved, free the entry in the source DB */
4123 deleteKey(src,c->argv[1]);
4124 server.dirty++;
4125 addReply(c,shared.cone);
4126 }
4127
4128 /* =================================== Lists ================================ */
4129 static void pushGenericCommand(redisClient *c, int where) {
4130 robj *lobj;
4131 list *list;
4132
4133 lobj = lookupKeyWrite(c->db,c->argv[1]);
4134 if (lobj == NULL) {
4135 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4136 addReply(c,shared.cone);
4137 return;
4138 }
4139 lobj = createListObject();
4140 list = lobj->ptr;
4141 if (where == REDIS_HEAD) {
4142 listAddNodeHead(list,c->argv[2]);
4143 } else {
4144 listAddNodeTail(list,c->argv[2]);
4145 }
4146 dictAdd(c->db->dict,c->argv[1],lobj);
4147 incrRefCount(c->argv[1]);
4148 incrRefCount(c->argv[2]);
4149 } else {
4150 if (lobj->type != REDIS_LIST) {
4151 addReply(c,shared.wrongtypeerr);
4152 return;
4153 }
4154 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4155 addReply(c,shared.cone);
4156 return;
4157 }
4158 list = lobj->ptr;
4159 if (where == REDIS_HEAD) {
4160 listAddNodeHead(list,c->argv[2]);
4161 } else {
4162 listAddNodeTail(list,c->argv[2]);
4163 }
4164 incrRefCount(c->argv[2]);
4165 }
4166 server.dirty++;
4167 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4168 }
4169
4170 static void lpushCommand(redisClient *c) {
4171 pushGenericCommand(c,REDIS_HEAD);
4172 }
4173
4174 static void rpushCommand(redisClient *c) {
4175 pushGenericCommand(c,REDIS_TAIL);
4176 }
4177
4178 static void llenCommand(redisClient *c) {
4179 robj *o;
4180 list *l;
4181
4182 o = lookupKeyRead(c->db,c->argv[1]);
4183 if (o == NULL) {
4184 addReply(c,shared.czero);
4185 return;
4186 } else {
4187 if (o->type != REDIS_LIST) {
4188 addReply(c,shared.wrongtypeerr);
4189 } else {
4190 l = o->ptr;
4191 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4192 }
4193 }
4194 }
4195
4196 static void lindexCommand(redisClient *c) {
4197 robj *o;
4198 int index = atoi(c->argv[2]->ptr);
4199
4200 o = lookupKeyRead(c->db,c->argv[1]);
4201 if (o == NULL) {
4202 addReply(c,shared.nullbulk);
4203 } else {
4204 if (o->type != REDIS_LIST) {
4205 addReply(c,shared.wrongtypeerr);
4206 } else {
4207 list *list = o->ptr;
4208 listNode *ln;
4209
4210 ln = listIndex(list, index);
4211 if (ln == NULL) {
4212 addReply(c,shared.nullbulk);
4213 } else {
4214 robj *ele = listNodeValue(ln);
4215 addReplyBulkLen(c,ele);
4216 addReply(c,ele);
4217 addReply(c,shared.crlf);
4218 }
4219 }
4220 }
4221 }
4222
4223 static void lsetCommand(redisClient *c) {
4224 robj *o;
4225 int index = atoi(c->argv[2]->ptr);
4226
4227 o = lookupKeyWrite(c->db,c->argv[1]);
4228 if (o == NULL) {
4229 addReply(c,shared.nokeyerr);
4230 } else {
4231 if (o->type != REDIS_LIST) {
4232 addReply(c,shared.wrongtypeerr);
4233 } else {
4234 list *list = o->ptr;
4235 listNode *ln;
4236
4237 ln = listIndex(list, index);
4238 if (ln == NULL) {
4239 addReply(c,shared.outofrangeerr);
4240 } else {
4241 robj *ele = listNodeValue(ln);
4242
4243 decrRefCount(ele);
4244 listNodeValue(ln) = c->argv[3];
4245 incrRefCount(c->argv[3]);
4246 addReply(c,shared.ok);
4247 server.dirty++;
4248 }
4249 }
4250 }
4251 }
4252
4253 static void popGenericCommand(redisClient *c, int where) {
4254 robj *o;
4255
4256 o = lookupKeyWrite(c->db,c->argv[1]);
4257 if (o == NULL) {
4258 addReply(c,shared.nullbulk);
4259 } else {
4260 if (o->type != REDIS_LIST) {
4261 addReply(c,shared.wrongtypeerr);
4262 } else {
4263 list *list = o->ptr;
4264 listNode *ln;
4265
4266 if (where == REDIS_HEAD)
4267 ln = listFirst(list);
4268 else
4269 ln = listLast(list);
4270
4271 if (ln == NULL) {
4272 addReply(c,shared.nullbulk);
4273 } else {
4274 robj *ele = listNodeValue(ln);
4275 addReplyBulkLen(c,ele);
4276 addReply(c,ele);
4277 addReply(c,shared.crlf);
4278 listDelNode(list,ln);
4279 server.dirty++;
4280 }
4281 }
4282 }
4283 }
4284
4285 static void lpopCommand(redisClient *c) {
4286 popGenericCommand(c,REDIS_HEAD);
4287 }
4288
4289 static void rpopCommand(redisClient *c) {
4290 popGenericCommand(c,REDIS_TAIL);
4291 }
4292
4293 static void lrangeCommand(redisClient *c) {
4294 robj *o;
4295 int start = atoi(c->argv[2]->ptr);
4296 int end = atoi(c->argv[3]->ptr);
4297
4298 o = lookupKeyRead(c->db,c->argv[1]);
4299 if (o == NULL) {
4300 addReply(c,shared.nullmultibulk);
4301 } else {
4302 if (o->type != REDIS_LIST) {
4303 addReply(c,shared.wrongtypeerr);
4304 } else {
4305 list *list = o->ptr;
4306 listNode *ln;
4307 int llen = listLength(list);
4308 int rangelen, j;
4309 robj *ele;
4310
4311 /* convert negative indexes */
4312 if (start < 0) start = llen+start;
4313 if (end < 0) end = llen+end;
4314 if (start < 0) start = 0;
4315 if (end < 0) end = 0;
4316
4317 /* indexes sanity checks */
4318 if (start > end || start >= llen) {
4319 /* Out of range start or start > end result in empty list */
4320 addReply(c,shared.emptymultibulk);
4321 return;
4322 }
4323 if (end >= llen) end = llen-1;
4324 rangelen = (end-start)+1;
4325
4326 /* Return the result in form of a multi-bulk reply */
4327 ln = listIndex(list, start);
4328 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4329 for (j = 0; j < rangelen; j++) {
4330 ele = listNodeValue(ln);
4331 addReplyBulkLen(c,ele);
4332 addReply(c,ele);
4333 addReply(c,shared.crlf);
4334 ln = ln->next;
4335 }
4336 }
4337 }
4338 }
4339
4340 static void ltrimCommand(redisClient *c) {
4341 robj *o;
4342 int start = atoi(c->argv[2]->ptr);
4343 int end = atoi(c->argv[3]->ptr);
4344
4345 o = lookupKeyWrite(c->db,c->argv[1]);
4346 if (o == NULL) {
4347 addReply(c,shared.ok);
4348 } else {
4349 if (o->type != REDIS_LIST) {
4350 addReply(c,shared.wrongtypeerr);
4351 } else {
4352 list *list = o->ptr;
4353 listNode *ln;
4354 int llen = listLength(list);
4355 int j, ltrim, rtrim;
4356
4357 /* convert negative indexes */
4358 if (start < 0) start = llen+start;
4359 if (end < 0) end = llen+end;
4360 if (start < 0) start = 0;
4361 if (end < 0) end = 0;
4362
4363 /* indexes sanity checks */
4364 if (start > end || start >= llen) {
4365 /* Out of range start or start > end result in empty list */
4366 ltrim = llen;
4367 rtrim = 0;
4368 } else {
4369 if (end >= llen) end = llen-1;
4370 ltrim = start;
4371 rtrim = llen-end-1;
4372 }
4373
4374 /* Remove list elements to perform the trim */
4375 for (j = 0; j < ltrim; j++) {
4376 ln = listFirst(list);
4377 listDelNode(list,ln);
4378 }
4379 for (j = 0; j < rtrim; j++) {
4380 ln = listLast(list);
4381 listDelNode(list,ln);
4382 }
4383 server.dirty++;
4384 addReply(c,shared.ok);
4385 }
4386 }
4387 }
4388
4389 static void lremCommand(redisClient *c) {
4390 robj *o;
4391
4392 o = lookupKeyWrite(c->db,c->argv[1]);
4393 if (o == NULL) {
4394 addReply(c,shared.czero);
4395 } else {
4396 if (o->type != REDIS_LIST) {
4397 addReply(c,shared.wrongtypeerr);
4398 } else {
4399 list *list = o->ptr;
4400 listNode *ln, *next;
4401 int toremove = atoi(c->argv[2]->ptr);
4402 int removed = 0;
4403 int fromtail = 0;
4404
4405 if (toremove < 0) {
4406 toremove = -toremove;
4407 fromtail = 1;
4408 }
4409 ln = fromtail ? list->tail : list->head;
4410 while (ln) {
4411 robj *ele = listNodeValue(ln);
4412
4413 next = fromtail ? ln->prev : ln->next;
4414 if (compareStringObjects(ele,c->argv[3]) == 0) {
4415 listDelNode(list,ln);
4416 server.dirty++;
4417 removed++;
4418 if (toremove && removed == toremove) break;
4419 }
4420 ln = next;
4421 }
4422 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4423 }
4424 }
4425 }
4426
4427 /* This is the semantic of this command:
4428 * RPOPLPUSH srclist dstlist:
4429 * IF LLEN(srclist) > 0
4430 * element = RPOP srclist
4431 * LPUSH dstlist element
4432 * RETURN element
4433 * ELSE
4434 * RETURN nil
4435 * END
4436 * END
4437 *
4438 * The idea is to be able to get an element from a list in a reliable way
4439 * since the element is not just returned but pushed against another list
4440 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4441 */
4442 static void rpoplpushcommand(redisClient *c) {
4443 robj *sobj;
4444
4445 sobj = lookupKeyWrite(c->db,c->argv[1]);
4446 if (sobj == NULL) {
4447 addReply(c,shared.nullbulk);
4448 } else {
4449 if (sobj->type != REDIS_LIST) {
4450 addReply(c,shared.wrongtypeerr);
4451 } else {
4452 list *srclist = sobj->ptr;
4453 listNode *ln = listLast(srclist);
4454
4455 if (ln == NULL) {
4456 addReply(c,shared.nullbulk);
4457 } else {
4458 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4459 robj *ele = listNodeValue(ln);
4460 list *dstlist;
4461
4462 if (dobj && dobj->type != REDIS_LIST) {
4463 addReply(c,shared.wrongtypeerr);
4464 return;
4465 }
4466
4467 /* Add the element to the target list (unless it's directly
4468 * passed to some BLPOP-ing client */
4469 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4470 if (dobj == NULL) {
4471 /* Create the list if the key does not exist */
4472 dobj = createListObject();
4473 dictAdd(c->db->dict,c->argv[2],dobj);
4474 incrRefCount(c->argv[2]);
4475 }
4476 dstlist = dobj->ptr;
4477 listAddNodeHead(dstlist,ele);
4478 incrRefCount(ele);
4479 }
4480
4481 /* Send the element to the client as reply as well */
4482 addReplyBulkLen(c,ele);
4483 addReply(c,ele);
4484 addReply(c,shared.crlf);
4485
4486 /* Finally remove the element from the source list */
4487 listDelNode(srclist,ln);
4488 server.dirty++;
4489 }
4490 }
4491 }
4492 }
4493
4494
4495 /* ==================================== Sets ================================ */
4496
4497 static void saddCommand(redisClient *c) {
4498 robj *set;
4499
4500 set = lookupKeyWrite(c->db,c->argv[1]);
4501 if (set == NULL) {
4502 set = createSetObject();
4503 dictAdd(c->db->dict,c->argv[1],set);
4504 incrRefCount(c->argv[1]);
4505 } else {
4506 if (set->type != REDIS_SET) {
4507 addReply(c,shared.wrongtypeerr);
4508 return;
4509 }
4510 }
4511 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4512 incrRefCount(c->argv[2]);
4513 server.dirty++;
4514 addReply(c,shared.cone);
4515 } else {
4516 addReply(c,shared.czero);
4517 }
4518 }
4519
4520 static void sremCommand(redisClient *c) {
4521 robj *set;
4522
4523 set = lookupKeyWrite(c->db,c->argv[1]);
4524 if (set == NULL) {
4525 addReply(c,shared.czero);
4526 } else {
4527 if (set->type != REDIS_SET) {
4528 addReply(c,shared.wrongtypeerr);
4529 return;
4530 }
4531 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4532 server.dirty++;
4533 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4534 addReply(c,shared.cone);
4535 } else {
4536 addReply(c,shared.czero);
4537 }
4538 }
4539 }
4540
4541 static void smoveCommand(redisClient *c) {
4542 robj *srcset, *dstset;
4543
4544 srcset = lookupKeyWrite(c->db,c->argv[1]);
4545 dstset = lookupKeyWrite(c->db,c->argv[2]);
4546
4547 /* If the source key does not exist return 0, if it's of the wrong type
4548 * raise an error */
4549 if (srcset == NULL || srcset->type != REDIS_SET) {
4550 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4551 return;
4552 }
4553 /* Error if the destination key is not a set as well */
4554 if (dstset && dstset->type != REDIS_SET) {
4555 addReply(c,shared.wrongtypeerr);
4556 return;
4557 }
4558 /* Remove the element from the source set */
4559 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4560 /* Key not found in the src set! return zero */
4561 addReply(c,shared.czero);
4562 return;
4563 }
4564 server.dirty++;
4565 /* Add the element to the destination set */
4566 if (!dstset) {
4567 dstset = createSetObject();
4568 dictAdd(c->db->dict,c->argv[2],dstset);
4569 incrRefCount(c->argv[2]);
4570 }
4571 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4572 incrRefCount(c->argv[3]);
4573 addReply(c,shared.cone);
4574 }
4575
4576 static void sismemberCommand(redisClient *c) {
4577 robj *set;
4578
4579 set = lookupKeyRead(c->db,c->argv[1]);
4580 if (set == NULL) {
4581 addReply(c,shared.czero);
4582 } else {
4583 if (set->type != REDIS_SET) {
4584 addReply(c,shared.wrongtypeerr);
4585 return;
4586 }
4587 if (dictFind(set->ptr,c->argv[2]))
4588 addReply(c,shared.cone);
4589 else
4590 addReply(c,shared.czero);
4591 }
4592 }
4593
4594 static void scardCommand(redisClient *c) {
4595 robj *o;
4596 dict *s;
4597
4598 o = lookupKeyRead(c->db,c->argv[1]);
4599 if (o == NULL) {
4600 addReply(c,shared.czero);
4601 return;
4602 } else {
4603 if (o->type != REDIS_SET) {
4604 addReply(c,shared.wrongtypeerr);
4605 } else {
4606 s = o->ptr;
4607 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4608 dictSize(s)));
4609 }
4610 }
4611 }
4612
4613 static void spopCommand(redisClient *c) {
4614 robj *set;
4615 dictEntry *de;
4616
4617 set = lookupKeyWrite(c->db,c->argv[1]);
4618 if (set == NULL) {
4619 addReply(c,shared.nullbulk);
4620 } else {
4621 if (set->type != REDIS_SET) {
4622 addReply(c,shared.wrongtypeerr);
4623 return;
4624 }
4625 de = dictGetRandomKey(set->ptr);
4626 if (de == NULL) {
4627 addReply(c,shared.nullbulk);
4628 } else {
4629 robj *ele = dictGetEntryKey(de);
4630
4631 addReplyBulkLen(c,ele);
4632 addReply(c,ele);
4633 addReply(c,shared.crlf);
4634 dictDelete(set->ptr,ele);
4635 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4636 server.dirty++;
4637 }
4638 }
4639 }
4640
4641 static void srandmemberCommand(redisClient *c) {
4642 robj *set;
4643 dictEntry *de;
4644
4645 set = lookupKeyRead(c->db,c->argv[1]);
4646 if (set == NULL) {
4647 addReply(c,shared.nullbulk);
4648 } else {
4649 if (set->type != REDIS_SET) {
4650 addReply(c,shared.wrongtypeerr);
4651 return;
4652 }
4653 de = dictGetRandomKey(set->ptr);
4654 if (de == NULL) {
4655 addReply(c,shared.nullbulk);
4656 } else {
4657 robj *ele = dictGetEntryKey(de);
4658
4659 addReplyBulkLen(c,ele);
4660 addReply(c,ele);
4661 addReply(c,shared.crlf);
4662 }
4663 }
4664 }
4665
4666 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4667 dict **d1 = (void*) s1, **d2 = (void*) s2;
4668
4669 return dictSize(*d1)-dictSize(*d2);
4670 }
4671
4672 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4673 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4674 dictIterator *di;
4675 dictEntry *de;
4676 robj *lenobj = NULL, *dstset = NULL;
4677 unsigned long j, cardinality = 0;
4678
4679 for (j = 0; j < setsnum; j++) {
4680 robj *setobj;
4681
4682 setobj = dstkey ?
4683 lookupKeyWrite(c->db,setskeys[j]) :
4684 lookupKeyRead(c->db,setskeys[j]);
4685 if (!setobj) {
4686 zfree(dv);
4687 if (dstkey) {
4688 if (deleteKey(c->db,dstkey))
4689 server.dirty++;
4690 addReply(c,shared.czero);
4691 } else {
4692 addReply(c,shared.nullmultibulk);
4693 }
4694 return;
4695 }
4696 if (setobj->type != REDIS_SET) {
4697 zfree(dv);
4698 addReply(c,shared.wrongtypeerr);
4699 return;
4700 }
4701 dv[j] = setobj->ptr;
4702 }
4703 /* Sort sets from the smallest to largest, this will improve our
4704 * algorithm's performace */
4705 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4706
4707 /* The first thing we should output is the total number of elements...
4708 * since this is a multi-bulk write, but at this stage we don't know
4709 * the intersection set size, so we use a trick, append an empty object
4710 * to the output list and save the pointer to later modify it with the
4711 * right length */
4712 if (!dstkey) {
4713 lenobj = createObject(REDIS_STRING,NULL);
4714 addReply(c,lenobj);
4715 decrRefCount(lenobj);
4716 } else {
4717 /* If we have a target key where to store the resulting set
4718 * create this key with an empty set inside */
4719 dstset = createSetObject();
4720 }
4721
4722 /* Iterate all the elements of the first (smallest) set, and test
4723 * the element against all the other sets, if at least one set does
4724 * not include the element it is discarded */
4725 di = dictGetIterator(dv[0]);
4726
4727 while((de = dictNext(di)) != NULL) {
4728 robj *ele;
4729
4730 for (j = 1; j < setsnum; j++)
4731 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4732 if (j != setsnum)
4733 continue; /* at least one set does not contain the member */
4734 ele = dictGetEntryKey(de);
4735 if (!dstkey) {
4736 addReplyBulkLen(c,ele);
4737 addReply(c,ele);
4738 addReply(c,shared.crlf);
4739 cardinality++;
4740 } else {
4741 dictAdd(dstset->ptr,ele,NULL);
4742 incrRefCount(ele);
4743 }
4744 }
4745 dictReleaseIterator(di);
4746
4747 if (dstkey) {
4748 /* Store the resulting set into the target */
4749 deleteKey(c->db,dstkey);
4750 dictAdd(c->db->dict,dstkey,dstset);
4751 incrRefCount(dstkey);
4752 }
4753
4754 if (!dstkey) {
4755 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4756 } else {
4757 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4758 dictSize((dict*)dstset->ptr)));
4759 server.dirty++;
4760 }
4761 zfree(dv);
4762 }
4763
4764 static void sinterCommand(redisClient *c) {
4765 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4766 }
4767
4768 static void sinterstoreCommand(redisClient *c) {
4769 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4770 }
4771
/* Set operation selectors, used as the `op' argument of
 * sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4775
4776 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4777 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4778 dictIterator *di;
4779 dictEntry *de;
4780 robj *dstset = NULL;
4781 int j, cardinality = 0;
4782
4783 for (j = 0; j < setsnum; j++) {
4784 robj *setobj;
4785
4786 setobj = dstkey ?
4787 lookupKeyWrite(c->db,setskeys[j]) :
4788 lookupKeyRead(c->db,setskeys[j]);
4789 if (!setobj) {
4790 dv[j] = NULL;
4791 continue;
4792 }
4793 if (setobj->type != REDIS_SET) {
4794 zfree(dv);
4795 addReply(c,shared.wrongtypeerr);
4796 return;
4797 }
4798 dv[j] = setobj->ptr;
4799 }
4800
4801 /* We need a temp set object to store our union. If the dstkey
4802 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4803 * this set object will be the resulting object to set into the target key*/
4804 dstset = createSetObject();
4805
4806 /* Iterate all the elements of all the sets, add every element a single
4807 * time to the result set */
4808 for (j = 0; j < setsnum; j++) {
4809 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4810 if (!dv[j]) continue; /* non existing keys are like empty sets */
4811
4812 di = dictGetIterator(dv[j]);
4813
4814 while((de = dictNext(di)) != NULL) {
4815 robj *ele;
4816
4817 /* dictAdd will not add the same element multiple times */
4818 ele = dictGetEntryKey(de);
4819 if (op == REDIS_OP_UNION || j == 0) {
4820 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4821 incrRefCount(ele);
4822 cardinality++;
4823 }
4824 } else if (op == REDIS_OP_DIFF) {
4825 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4826 cardinality--;
4827 }
4828 }
4829 }
4830 dictReleaseIterator(di);
4831
4832 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4833 }
4834
4835 /* Output the content of the resulting set, if not in STORE mode */
4836 if (!dstkey) {
4837 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4838 di = dictGetIterator(dstset->ptr);
4839 while((de = dictNext(di)) != NULL) {
4840 robj *ele;
4841
4842 ele = dictGetEntryKey(de);
4843 addReplyBulkLen(c,ele);
4844 addReply(c,ele);
4845 addReply(c,shared.crlf);
4846 }
4847 dictReleaseIterator(di);
4848 } else {
4849 /* If we have a target key where to store the resulting set
4850 * create this key with the result set inside */
4851 deleteKey(c->db,dstkey);
4852 dictAdd(c->db->dict,dstkey,dstset);
4853 incrRefCount(dstkey);
4854 }
4855
4856 /* Cleanup */
4857 if (!dstkey) {
4858 decrRefCount(dstset);
4859 } else {
4860 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4861 dictSize((dict*)dstset->ptr)));
4862 server.dirty++;
4863 }
4864 zfree(dv);
4865 }
4866
4867 static void sunionCommand(redisClient *c) {
4868 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4869 }
4870
4871 static void sunionstoreCommand(redisClient *c) {
4872 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4873 }
4874
4875 static void sdiffCommand(redisClient *c) {
4876 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4877 }
4878
4879 static void sdiffstoreCommand(redisClient *c) {
4880 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4881 }
4882
4883 /* ==================================== ZSets =============================== */
4884
4885 /* ZSETs are ordered sets using two data structures to hold the same elements
4886 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4887 * data structure.
4888 *
4889 * The elements are added to a hash table mapping Redis objects to scores.
4890 * At the same time the elements are added to a skip list mapping scores
4891 * to Redis objects (so objects are sorted by scores in this "view"). */
4892
4893 /* This skiplist implementation is almost a C translation of the original
4894 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4895 * Alternative to Balanced Trees", modified in three ways:
4896 * a) this implementation allows for repeated values.
4897 * b) the comparison is not just by key (our 'score') but by satellite data.
4898 * c) there is a back pointer, so it's a doubly linked list with the back
4899 * pointers being only at "level 1". This allows to traverse the list
4900 * from tail to head, useful for ZREVRANGE. */
4901
4902 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4903 zskiplistNode *zn = zmalloc(sizeof(*zn));
4904
4905 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4906 if (level > 0)
4907 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4908 zn->score = score;
4909 zn->obj = obj;
4910 return zn;
4911 }
4912
4913 static zskiplist *zslCreate(void) {
4914 int j;
4915 zskiplist *zsl;
4916
4917 zsl = zmalloc(sizeof(*zsl));
4918 zsl->level = 1;
4919 zsl->length = 0;
4920 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4921 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
4922 zsl->header->forward[j] = NULL;
4923
4924 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
4925 if (j < ZSKIPLIST_MAXLEVEL-1)
4926 zsl->header->span[j] = 0;
4927 }
4928 zsl->header->backward = NULL;
4929 zsl->tail = NULL;
4930 return zsl;
4931 }
4932
4933 static void zslFreeNode(zskiplistNode *node) {
4934 decrRefCount(node->obj);
4935 zfree(node->forward);
4936 zfree(node->span);
4937 zfree(node);
4938 }
4939
4940 static void zslFree(zskiplist *zsl) {
4941 zskiplistNode *node = zsl->header->forward[0], *next;
4942
4943 zfree(zsl->header->forward);
4944 zfree(zsl->header->span);
4945 zfree(zsl->header);
4946 while(node) {
4947 next = node->forward[0];
4948 zslFreeNode(node);
4949 node = next;
4950 }
4951 zfree(zsl);
4952 }
4953
4954 static int zslRandomLevel(void) {
4955 int level = 1;
4956 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4957 level += 1;
4958 return level;
4959 }
4960
4961 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4962 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4963 unsigned int rank[ZSKIPLIST_MAXLEVEL];
4964 int i, level;
4965
4966 x = zsl->header;
4967 for (i = zsl->level-1; i >= 0; i--) {
4968 /* store rank that is crossed to reach the insert position */
4969 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
4970
4971 while (x->forward[i] &&
4972 (x->forward[i]->score < score ||
4973 (x->forward[i]->score == score &&
4974 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
4975 rank[i] += i > 0 ? x->span[i-1] : 1;
4976 x = x->forward[i];
4977 }
4978 update[i] = x;
4979 }
4980 /* we assume the key is not already inside, since we allow duplicated
4981 * scores, and the re-insertion of score and redis object should never
4982 * happpen since the caller of zslInsert() should test in the hash table
4983 * if the element is already inside or not. */
4984 level = zslRandomLevel();
4985 if (level > zsl->level) {
4986 for (i = zsl->level; i < level; i++) {
4987 rank[i] = 0;
4988 update[i] = zsl->header;
4989 update[i]->span[i-1] = zsl->length;
4990 }
4991 zsl->level = level;
4992 }
4993 x = zslCreateNode(level,score,obj);
4994 for (i = 0; i < level; i++) {
4995 x->forward[i] = update[i]->forward[i];
4996 update[i]->forward[i] = x;
4997
4998 /* update span covered by update[i] as x is inserted here */
4999 if (i > 0) {
5000 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5001 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5002 }
5003 }
5004
5005 /* increment span for untouched levels */
5006 for (i = level; i < zsl->level; i++) {
5007 update[i]->span[i-1]++;
5008 }
5009
5010 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5011 if (x->forward[0])
5012 x->forward[0]->backward = x;
5013 else
5014 zsl->tail = x;
5015 zsl->length++;
5016 }
5017
5018 /* Delete an element with matching score/object from the skiplist. */
5019 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5020 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5021 int i;
5022
5023 x = zsl->header;
5024 for (i = zsl->level-1; i >= 0; i--) {
5025 while (x->forward[i] &&
5026 (x->forward[i]->score < score ||
5027 (x->forward[i]->score == score &&
5028 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5029 x = x->forward[i];
5030 update[i] = x;
5031 }
5032 /* We may have multiple elements with the same score, what we need
5033 * is to find the element with both the right score and object. */
5034 x = x->forward[0];
5035 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5036 for (i = 0; i < zsl->level; i++) {
5037 if (update[i]->forward[i] == x) {
5038 if (i > 0) {
5039 update[i]->span[i-1] += x->span[i-1] - 1;
5040 }
5041 update[i]->forward[i] = x->forward[i];
5042 } else {
5043 /* invariant: i > 0, because update[0]->forward[0]
5044 * is always equal to x */
5045 update[i]->span[i-1] -= 1;
5046 }
5047 }
5048 if (x->forward[0]) {
5049 x->forward[0]->backward = x->backward;
5050 } else {
5051 zsl->tail = x->backward;
5052 }
5053 zslFreeNode(x);
5054 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5055 zsl->level--;
5056 zsl->length--;
5057 return 1;
5058 } else {
5059 return 0; /* not found */
5060 }
5061 return 0; /* not found */
5062 }
5063
5064 /* Delete all the elements with score between min and max from the skiplist.
5065 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5066 * Note that this function takes the reference to the hash table view of the
5067 * sorted set, in order to remove the elements from the hash table too. */
5068 static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
5069 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5070 unsigned long removed = 0;
5071 int i;
5072
5073 x = zsl->header;
5074 for (i = zsl->level-1; i >= 0; i--) {
5075 while (x->forward[i] && x->forward[i]->score < min)
5076 x = x->forward[i];
5077 update[i] = x;
5078 }
5079 /* We may have multiple elements with the same score, what we need
5080 * is to find the element with both the right score and object. */
5081 x = x->forward[0];
5082 while (x && x->score <= max) {
5083 zskiplistNode *next;
5084
5085 for (i = 0; i < zsl->level; i++) {
5086 if (update[i]->forward[i] == x) {
5087 if (i > 0) {
5088 update[i]->span[i-1] += x->span[i-1] - 1;
5089 }
5090 update[i]->forward[i] = x->forward[i];
5091 } else {
5092 /* invariant: i > 0, because update[0]->forward[0]
5093 * is always equal to x */
5094 update[i]->span[i-1] -= 1;
5095 }
5096 }
5097 if (x->forward[0]) {
5098 x->forward[0]->backward = x->backward;
5099 } else {
5100 zsl->tail = x->backward;
5101 }
5102 next = x->forward[0];
5103 dictDelete(dict,x->obj);
5104 zslFreeNode(x);
5105 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5106 zsl->level--;
5107 zsl->length--;
5108 removed++;
5109 x = next;
5110 }
5111 return removed; /* not found */
5112 }
5113
5114 /* Find the first node having a score equal or greater than the specified one.
5115 * Returns NULL if there is no match. */
5116 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5117 zskiplistNode *x;
5118 int i;
5119
5120 x = zsl->header;
5121 for (i = zsl->level-1; i >= 0; i--) {
5122 while (x->forward[i] && x->forward[i]->score < score)
5123 x = x->forward[i];
5124 }
5125 /* We may have multiple elements with the same score, what we need
5126 * is to find the element with both the right score and object. */
5127 return x->forward[0];
5128 }
5129
5130 /* Find the rank for an element by both score and key.
5131 * Returns 0 when the element cannot be found, rank otherwise.
5132 * Note that the rank is 1-based due to the span of zsl->header to the
5133 * first element. */
5134 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5135 zskiplistNode *x;
5136 unsigned long rank = 0;
5137 int i;
5138
5139 x = zsl->header;
5140 for (i = zsl->level-1; i >= 0; i--) {
5141 while (x->forward[i] &&
5142 (x->forward[i]->score < score ||
5143 (x->forward[i]->score == score &&
5144 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5145 rank += i > 0 ? x->span[i-1] : 1;
5146 x = x->forward[i];
5147 }
5148
5149 /* x might be equal to zsl->header, so test if obj is non-NULL */
5150 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5151 return rank;
5152 }
5153 }
5154 return 0;
5155 }
5156
5157 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5158 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5159 zskiplistNode *x;
5160 unsigned long traversed = 0;
5161 int i;
5162
5163 x = zsl->header;
5164 for (i = zsl->level-1; i >= 0; i--) {
5165 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5166 traversed += i > 0 ? x->span[i-1] : 1;
5167 x = x->forward[i];
5168 }
5169
5170 if (traversed == rank) {
5171 return x;
5172 }
5173 }
5174 return NULL;
5175 }
5176
5177 /* The actual Z-commands implementations */
5178
5179 /* This generic command implements both ZADD and ZINCRBY.
5180 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5181 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5182 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5183 robj *zsetobj;
5184 zset *zs;
5185 double *score;
5186
5187 zsetobj = lookupKeyWrite(c->db,key);
5188 if (zsetobj == NULL) {
5189 zsetobj = createZsetObject();
5190 dictAdd(c->db->dict,key,zsetobj);
5191 incrRefCount(key);
5192 } else {
5193 if (zsetobj->type != REDIS_ZSET) {
5194 addReply(c,shared.wrongtypeerr);
5195 return;
5196 }
5197 }
5198 zs = zsetobj->ptr;
5199
5200 /* Ok now since we implement both ZADD and ZINCRBY here the code
5201 * needs to handle the two different conditions. It's all about setting
5202 * '*score', that is, the new score to set, to the right value. */
5203 score = zmalloc(sizeof(double));
5204 if (doincrement) {
5205 dictEntry *de;
5206
5207 /* Read the old score. If the element was not present starts from 0 */
5208 de = dictFind(zs->dict,ele);
5209 if (de) {
5210 double *oldscore = dictGetEntryVal(de);
5211 *score = *oldscore + scoreval;
5212 } else {
5213 *score = scoreval;
5214 }
5215 } else {
5216 *score = scoreval;
5217 }
5218
5219 /* What follows is a simple remove and re-insert operation that is common
5220 * to both ZADD and ZINCRBY... */
5221 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5222 /* case 1: New element */
5223 incrRefCount(ele); /* added to hash */
5224 zslInsert(zs->zsl,*score,ele);
5225 incrRefCount(ele); /* added to skiplist */
5226 server.dirty++;
5227 if (doincrement)
5228 addReplyDouble(c,*score);
5229 else
5230 addReply(c,shared.cone);
5231 } else {
5232 dictEntry *de;
5233 double *oldscore;
5234
5235 /* case 2: Score update operation */
5236 de = dictFind(zs->dict,ele);
5237 redisAssert(de != NULL);
5238 oldscore = dictGetEntryVal(de);
5239 if (*score != *oldscore) {
5240 int deleted;
5241
5242 /* Remove and insert the element in the skip list with new score */
5243 deleted = zslDelete(zs->zsl,*oldscore,ele);
5244 redisAssert(deleted != 0);
5245 zslInsert(zs->zsl,*score,ele);
5246 incrRefCount(ele);
5247 /* Update the score in the hash table */
5248 dictReplace(zs->dict,ele,score);
5249 server.dirty++;
5250 } else {
5251 zfree(score);
5252 }
5253 if (doincrement)
5254 addReplyDouble(c,*score);
5255 else
5256 addReply(c,shared.czero);
5257 }
5258 }
5259
5260 static void zaddCommand(redisClient *c) {
5261 double scoreval;
5262
5263 scoreval = strtod(c->argv[2]->ptr,NULL);
5264 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5265 }
5266
5267 static void zincrbyCommand(redisClient *c) {
5268 double scoreval;
5269
5270 scoreval = strtod(c->argv[2]->ptr,NULL);
5271 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5272 }
5273
5274 static void zremCommand(redisClient *c) {
5275 robj *zsetobj;
5276 zset *zs;
5277
5278 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5279 if (zsetobj == NULL) {
5280 addReply(c,shared.czero);
5281 } else {
5282 dictEntry *de;
5283 double *oldscore;
5284 int deleted;
5285
5286 if (zsetobj->type != REDIS_ZSET) {
5287 addReply(c,shared.wrongtypeerr);
5288 return;
5289 }
5290 zs = zsetobj->ptr;
5291 de = dictFind(zs->dict,c->argv[2]);
5292 if (de == NULL) {
5293 addReply(c,shared.czero);
5294 return;
5295 }
5296 /* Delete from the skiplist */
5297 oldscore = dictGetEntryVal(de);
5298 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5299 redisAssert(deleted != 0);
5300
5301 /* Delete from the hash table */
5302 dictDelete(zs->dict,c->argv[2]);
5303 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5304 server.dirty++;
5305 addReply(c,shared.cone);
5306 }
5307 }
5308
5309 static void zremrangebyscoreCommand(redisClient *c) {
5310 double min = strtod(c->argv[2]->ptr,NULL);
5311 double max = strtod(c->argv[3]->ptr,NULL);
5312 robj *zsetobj;
5313 zset *zs;
5314
5315 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5316 if (zsetobj == NULL) {
5317 addReply(c,shared.czero);
5318 } else {
5319 long deleted;
5320
5321 if (zsetobj->type != REDIS_ZSET) {
5322 addReply(c,shared.wrongtypeerr);
5323 return;
5324 }
5325 zs = zsetobj->ptr;
5326 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5327 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5328 server.dirty += deleted;
5329 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5330 }
5331 }
5332
5333 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5334 int i, j, k, zsetnum;
5335 dict **srcdicts;
5336 double *weights;
5337 robj *dstobj;
5338 zset *dstzset;
5339 dictIterator *di;
5340 dictEntry *de;
5341
5342 /* expect zsetnum input keys to be given */
5343 zsetnum = atoi(c->argv[2]->ptr);
5344 if (zsetnum < 1) {
5345 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5346 return;
5347 }
5348
5349 /* test if the expected number of keys would overflow */
5350 if (3+zsetnum > c->argc) {
5351 addReply(c,shared.syntaxerr);
5352 return;
5353 }
5354
5355 /* read keys to be used for input */
5356 srcdicts = zmalloc(sizeof(dict*) * zsetnum);
5357 weights = zmalloc(sizeof(double) * zsetnum);
5358 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5359 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5360 if (!zsetobj) {
5361 srcdicts[i] = NULL;
5362 } else {
5363 if (zsetobj->type != REDIS_ZSET) {
5364 zfree(srcdicts);
5365 zfree(weights);
5366 addReply(c,shared.wrongtypeerr);
5367 return;
5368 }
5369 srcdicts[i] = ((zset*)zsetobj->ptr)->dict;
5370 }
5371
5372 /* default all weights to 1 */
5373 weights[i] = 1.0;
5374 }
5375
5376 /* parse optional extra arguments */
5377 if (j < c->argc) {
5378 int remaining = c->argc-j;
5379
5380 while (remaining) {
5381 if (!strcasecmp(c->argv[j]->ptr,"weights")) {
5382 j++; remaining--;
5383 if (remaining < zsetnum) {
5384 zfree(srcdicts);
5385 zfree(weights);
5386 addReplySds(c,sdsnew("-ERR not enough weights for ZUNION/ZINTER\r\n"));
5387 return;
5388 }
5389 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5390 weights[i] = strtod(c->argv[j]->ptr, NULL);
5391 }
5392 } else {
5393 zfree(srcdicts);
5394 zfree(weights);
5395 addReply(c,shared.syntaxerr);
5396 return;
5397 }
5398 }
5399 }
5400
5401 dstobj = createZsetObject();
5402 dstzset = dstobj->ptr;
5403
5404 if (op == REDIS_OP_INTER) {
5405 /* store index of smallest zset in variable j */
5406 for (i = 0, j = 0; i < zsetnum; i++) {
5407 if (!srcdicts[i] || dictSize(srcdicts[i]) == 0) {
5408 break;
5409 }
5410 if (dictSize(srcdicts[i]) < dictSize(srcdicts[j])) {
5411 j = i;
5412 }
5413 }
5414 /* skip going over all entries if at least one dict was NULL or empty */
5415 if (i == zsetnum) {
5416 /* precondition: all srcdicts are non-NULL and non-empty */
5417 di = dictGetIterator(srcdicts[j]);
5418 while((de = dictNext(di)) != NULL) {
5419 double *score = zmalloc(sizeof(double));
5420 *score = 0.0;
5421
5422 for (k = 0; k < zsetnum; k++) {
5423 dictEntry *other = (k == j) ? de : dictFind(srcdicts[k],dictGetEntryKey(de));
5424 if (other) {
5425 *score = *score + weights[k] * (*(double*)dictGetEntryVal(other));
5426 } else {
5427 break;
5428 }
5429 }
5430
5431 /* skip entry when not present in every source dict */
5432 if (k != zsetnum) {
5433 zfree(score);
5434 } else {
5435 robj *o = dictGetEntryKey(de);
5436 dictAdd(dstzset->dict,o,score);
5437 incrRefCount(o); /* added to dictionary */
5438 zslInsert(dstzset->zsl,*score,o);
5439 incrRefCount(o); /* added to skiplist */
5440 }
5441 }
5442 dictReleaseIterator(di);
5443 }
5444 } else if (op == REDIS_OP_UNION) {
5445 for (i = 0; i < zsetnum; i++) {
5446 if (!srcdicts[i]) continue;
5447
5448 di = dictGetIterator(srcdicts[i]);
5449 while((de = dictNext(di)) != NULL) {
5450 /* skip key when already processed */
5451 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5452
5453 double *score = zmalloc(sizeof(double));
5454 *score = 0.0;
5455 for (j = 0; j < zsetnum; j++) {
5456 if (!srcdicts[j]) continue;
5457
5458 dictEntry *other = (i == j) ? de : dictFind(srcdicts[j],dictGetEntryKey(de));
5459 if (other) {
5460 *score = *score + weights[j] * (*(double*)dictGetEntryVal(other));
5461 }
5462 }
5463
5464 robj *o = dictGetEntryKey(de);
5465 dictAdd(dstzset->dict,o,score);
5466 incrRefCount(o); /* added to dictionary */
5467 zslInsert(dstzset->zsl,*score,o);
5468 incrRefCount(o); /* added to skiplist */
5469 }
5470 dictReleaseIterator(di);
5471 }
5472 } else {
5473 /* unknown operator */
5474 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5475 }
5476
5477 deleteKey(c->db,dstkey);
5478 dictAdd(c->db->dict,dstkey,dstobj);
5479 incrRefCount(dstkey);
5480
5481 addReplyLong(c, dstzset->zsl->length);
5482 server.dirty++;
5483 zfree(srcdicts);
5484 zfree(weights);
5485 }
5486
5487 static void zunionCommand(redisClient *c) {
5488 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5489 }
5490
5491 static void zinterCommand(redisClient *c) {
5492 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5493 }
5494
5495 static void zrangeGenericCommand(redisClient *c, int reverse) {
5496 robj *o;
5497 int start = atoi(c->argv[2]->ptr);
5498 int end = atoi(c->argv[3]->ptr);
5499 int withscores = 0;
5500
5501 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5502 withscores = 1;
5503 } else if (c->argc >= 5) {
5504 addReply(c,shared.syntaxerr);
5505 return;
5506 }
5507
5508 o = lookupKeyRead(c->db,c->argv[1]);
5509 if (o == NULL) {
5510 addReply(c,shared.nullmultibulk);
5511 } else {
5512 if (o->type != REDIS_ZSET) {
5513 addReply(c,shared.wrongtypeerr);
5514 } else {
5515 zset *zsetobj = o->ptr;
5516 zskiplist *zsl = zsetobj->zsl;
5517 zskiplistNode *ln;
5518
5519 int llen = zsl->length;
5520 int rangelen, j;
5521 robj *ele;
5522
5523 /* convert negative indexes */
5524 if (start < 0) start = llen+start;
5525 if (end < 0) end = llen+end;
5526 if (start < 0) start = 0;
5527 if (end < 0) end = 0;
5528
5529 /* indexes sanity checks */
5530 if (start > end || start >= llen) {
5531 /* Out of range start or start > end result in empty list */
5532 addReply(c,shared.emptymultibulk);
5533 return;
5534 }
5535 if (end >= llen) end = llen-1;
5536 rangelen = (end-start)+1;
5537
5538 /* check if starting point is trivial, before searching
5539 * the element in log(N) time */
5540 if (reverse) {
5541 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen - start);
5542 } else {
5543 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start + 1);
5544 }
5545
5546 /* Return the result in form of a multi-bulk reply */
5547 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5548 withscores ? (rangelen*2) : rangelen));
5549 for (j = 0; j < rangelen; j++) {
5550 ele = ln->obj;
5551 addReplyBulkLen(c,ele);
5552 addReply(c,ele);
5553 addReply(c,shared.crlf);
5554 if (withscores)
5555 addReplyDouble(c,ln->score);
5556 ln = reverse ? ln->backward : ln->forward[0];
5557 }
5558 }
5559 }
5560 }
5561
5562 static void zrangeCommand(redisClient *c) {
5563 zrangeGenericCommand(c,0);
5564 }
5565
5566 static void zrevrangeCommand(redisClient *c) {
5567 zrangeGenericCommand(c,1);
5568 }
5569
5570 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5571 * If justcount is non-zero, just the count is returned. */
5572 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5573 robj *o;
5574 double min, max;
5575 int minex = 0, maxex = 0; /* are min or max exclusive? */
5576 int offset = 0, limit = -1;
5577 int withscores = 0;
5578 int badsyntax = 0;
5579
5580 /* Parse the min-max interval. If one of the values is prefixed
5581 * by the "(" character, it's considered "open". For instance
5582 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5583 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5584 if (((char*)c->argv[2]->ptr)[0] == '(') {
5585 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5586 minex = 1;
5587 } else {
5588 min = strtod(c->argv[2]->ptr,NULL);
5589 }
5590 if (((char*)c->argv[3]->ptr)[0] == '(') {
5591 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5592 maxex = 1;
5593 } else {
5594 max = strtod(c->argv[3]->ptr,NULL);
5595 }
5596
5597 /* Parse "WITHSCORES": note that if the command was called with
5598 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5599 * enter the following paths to parse WITHSCORES and LIMIT. */
5600 if (c->argc == 5 || c->argc == 8) {
5601 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5602 withscores = 1;
5603 else
5604 badsyntax = 1;
5605 }
5606 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5607 badsyntax = 1;
5608 if (badsyntax) {
5609 addReplySds(c,
5610 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5611 return;
5612 }
5613
5614 /* Parse "LIMIT" */
5615 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5616 addReply(c,shared.syntaxerr);
5617 return;
5618 } else if (c->argc == (7 + withscores)) {
5619 offset = atoi(c->argv[5]->ptr);
5620 limit = atoi(c->argv[6]->ptr);
5621 if (offset < 0) offset = 0;
5622 }
5623
5624 /* Ok, lookup the key and get the range */
5625 o = lookupKeyRead(c->db,c->argv[1]);
5626 if (o == NULL) {
5627 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5628 } else {
5629 if (o->type != REDIS_ZSET) {
5630 addReply(c,shared.wrongtypeerr);
5631 } else {
5632 zset *zsetobj = o->ptr;
5633 zskiplist *zsl = zsetobj->zsl;
5634 zskiplistNode *ln;
5635 robj *ele, *lenobj = NULL;
5636 unsigned long rangelen = 0;
5637
5638 /* Get the first node with the score >= min, or with
5639 * score > min if 'minex' is true. */
5640 ln = zslFirstWithScore(zsl,min);
5641 while (minex && ln && ln->score == min) ln = ln->forward[0];
5642
5643 if (ln == NULL) {
5644 /* No element matching the speciifed interval */
5645 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5646 return;
5647 }
5648
5649 /* We don't know in advance how many matching elements there
5650 * are in the list, so we push this object that will represent
5651 * the multi-bulk length in the output buffer, and will "fix"
5652 * it later */
5653 if (!justcount) {
5654 lenobj = createObject(REDIS_STRING,NULL);
5655 addReply(c,lenobj);
5656 decrRefCount(lenobj);
5657 }
5658
5659 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5660 if (offset) {
5661 offset--;
5662 ln = ln->forward[0];
5663 continue;
5664 }
5665 if (limit == 0) break;
5666 if (!justcount) {
5667 ele = ln->obj;
5668 addReplyBulkLen(c,ele);
5669 addReply(c,ele);
5670 addReply(c,shared.crlf);
5671 if (withscores)
5672 addReplyDouble(c,ln->score);
5673 }
5674 ln = ln->forward[0];
5675 rangelen++;
5676 if (limit > 0) limit--;
5677 }
5678 if (justcount) {
5679 addReplyLong(c,(long)rangelen);
5680 } else {
5681 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5682 withscores ? (rangelen*2) : rangelen);
5683 }
5684 }
5685 }
5686 }
5687
5688 static void zrangebyscoreCommand(redisClient *c) {
5689 genericZrangebyscoreCommand(c,0);
5690 }
5691
5692 static void zcountCommand(redisClient *c) {
5693 genericZrangebyscoreCommand(c,1);
5694 }
5695
5696 static void zcardCommand(redisClient *c) {
5697 robj *o;
5698 zset *zs;
5699
5700 o = lookupKeyRead(c->db,c->argv[1]);
5701 if (o == NULL) {
5702 addReply(c,shared.czero);
5703 return;
5704 } else {
5705 if (o->type != REDIS_ZSET) {
5706 addReply(c,shared.wrongtypeerr);
5707 } else {
5708 zs = o->ptr;
5709 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5710 }
5711 }
5712 }
5713
5714 static void zscoreCommand(redisClient *c) {
5715 robj *o;
5716 zset *zs;
5717
5718 o = lookupKeyRead(c->db,c->argv[1]);
5719 if (o == NULL) {
5720 addReply(c,shared.nullbulk);
5721 return;
5722 } else {
5723 if (o->type != REDIS_ZSET) {
5724 addReply(c,shared.wrongtypeerr);
5725 } else {
5726 dictEntry *de;
5727
5728 zs = o->ptr;
5729 de = dictFind(zs->dict,c->argv[2]);
5730 if (!de) {
5731 addReply(c,shared.nullbulk);
5732 } else {
5733 double *score = dictGetEntryVal(de);
5734
5735 addReplyDouble(c,*score);
5736 }
5737 }
5738 }
5739 }
5740
5741 static void zrankCommand(redisClient *c) {
5742 robj *o;
5743 o = lookupKeyRead(c->db,c->argv[1]);
5744 if (o == NULL) {
5745 addReply(c,shared.nullbulk);
5746 return;
5747 }
5748 if (o->type != REDIS_ZSET) {
5749 addReply(c,shared.wrongtypeerr);
5750 } else {
5751 zset *zs = o->ptr;
5752 zskiplist *zsl = zs->zsl;
5753 dictEntry *de;
5754 unsigned long rank;
5755
5756 de = dictFind(zs->dict,c->argv[2]);
5757 if (!de) {
5758 addReply(c,shared.nullbulk);
5759 return;
5760 }
5761
5762 double *score = dictGetEntryVal(de);
5763 rank = zslGetRank(zsl, *score, c->argv[2]);
5764 if (rank) {
5765 addReplyLong(c, rank-1);
5766 } else {
5767 addReply(c,shared.nullbulk);
5768 }
5769 }
5770 }
5771
5772 /* =================================== Hashes =============================== */
5773 static void hsetCommand(redisClient *c) {
5774 int update = 0;
5775 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5776
5777 if (o == NULL) {
5778 o = createHashObject();
5779 dictAdd(c->db->dict,c->argv[1],o);
5780 incrRefCount(c->argv[1]);
5781 } else {
5782 if (o->type != REDIS_HASH) {
5783 addReply(c,shared.wrongtypeerr);
5784 return;
5785 }
5786 }
5787 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5788 unsigned char *zm = o->ptr;
5789
5790 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5791 c->argv[3]->ptr,sdslen(c->argv[3]->ptr),&update);
5792 o->ptr = zm;
5793 } else {
5794 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5795 incrRefCount(c->argv[2]);
5796 } else {
5797 update = 1;
5798 }
5799 incrRefCount(c->argv[3]);
5800 }
5801 server.dirty++;
5802 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5803 }
5804
5805 static void hgetCommand(redisClient *c) {
5806 robj *o = lookupKeyRead(c->db,c->argv[1]);
5807
5808 if (o == NULL) {
5809 addReply(c,shared.nullbulk);
5810 return;
5811 } else {
5812 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5813 unsigned char *zm = o->ptr;
5814 unsigned char *val;
5815 unsigned int vlen;
5816
5817 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
5818 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
5819 addReplySds(c,sdsnewlen(val,vlen));
5820 addReply(c,shared.crlf);
5821 return;
5822 } else {
5823 addReply(c,shared.nullbulk);
5824 return;
5825 }
5826 } else {
5827 struct dictEntry *de;
5828
5829 de = dictFind(o->ptr,c->argv[2]);
5830 if (de == NULL) {
5831 addReply(c,shared.nullbulk);
5832 } else {
5833 robj *e = dictGetEntryVal(de);
5834
5835 addReplyBulkLen(c,e);
5836 addReply(c,e);
5837 addReply(c,shared.crlf);
5838 }
5839 }
5840 }
5841 }
5842
5843 /* ========================= Non type-specific commands ==================== */
5844
5845 static void flushdbCommand(redisClient *c) {
5846 server.dirty += dictSize(c->db->dict);
5847 dictEmpty(c->db->dict);
5848 dictEmpty(c->db->expires);
5849 addReply(c,shared.ok);
5850 }
5851
5852 static void flushallCommand(redisClient *c) {
5853 server.dirty += emptyDb();
5854 addReply(c,shared.ok);
5855 rdbSave(server.dbfilename);
5856 server.dirty++;
5857 }
5858
5859 static redisSortOperation *createSortOperation(int type, robj *pattern) {
5860 redisSortOperation *so = zmalloc(sizeof(*so));
5861 so->type = type;
5862 so->pattern = pattern;
5863 return so;
5864 }
5865
5866 /* Return the value associated to the key with a name obtained
5867 * substituting the first occurence of '*' in 'pattern' with 'subst' */
5868 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
5869 char *p;
5870 sds spat, ssub;
5871 robj keyobj;
5872 int prefixlen, sublen, postfixlen;
5873 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5874 struct {
5875 long len;
5876 long free;
5877 char buf[REDIS_SORTKEY_MAX+1];
5878 } keyname;
5879
5880 /* If the pattern is "#" return the substitution object itself in order
5881 * to implement the "SORT ... GET #" feature. */
5882 spat = pattern->ptr;
5883 if (spat[0] == '#' && spat[1] == '\0') {
5884 return subst;
5885 }
5886
5887 /* The substitution object may be specially encoded. If so we create
5888 * a decoded object on the fly. Otherwise getDecodedObject will just
5889 * increment the ref count, that we'll decrement later. */
5890 subst = getDecodedObject(subst);
5891
5892 ssub = subst->ptr;
5893 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5894 p = strchr(spat,'*');
5895 if (!p) {
5896 decrRefCount(subst);
5897 return NULL;
5898 }
5899
5900 prefixlen = p-spat;
5901 sublen = sdslen(ssub);
5902 postfixlen = sdslen(spat)-(prefixlen+1);
5903 memcpy(keyname.buf,spat,prefixlen);
5904 memcpy(keyname.buf+prefixlen,ssub,sublen);
5905 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5906 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5907 keyname.len = prefixlen+sublen+postfixlen;
5908
5909 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
5910 decrRefCount(subst);
5911
5912 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
5913 return lookupKeyRead(db,&keyobj);
5914 }
5915
5916 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5917 * the additional parameter is not standard but a BSD-specific we have to
5918 * pass sorting parameters via the global 'server' structure */
5919 static int sortCompare(const void *s1, const void *s2) {
5920 const redisSortObject *so1 = s1, *so2 = s2;
5921 int cmp;
5922
5923 if (!server.sort_alpha) {
5924 /* Numeric sorting. Here it's trivial as we precomputed scores */
5925 if (so1->u.score > so2->u.score) {
5926 cmp = 1;
5927 } else if (so1->u.score < so2->u.score) {
5928 cmp = -1;
5929 } else {
5930 cmp = 0;
5931 }
5932 } else {
5933 /* Alphanumeric sorting */
5934 if (server.sort_bypattern) {
5935 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5936 /* At least one compare object is NULL */
5937 if (so1->u.cmpobj == so2->u.cmpobj)
5938 cmp = 0;
5939 else if (so1->u.cmpobj == NULL)
5940 cmp = -1;
5941 else
5942 cmp = 1;
5943 } else {
5944 /* We have both the objects, use strcoll */
5945 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5946 }
5947 } else {
5948 /* Compare elements directly */
5949 robj *dec1, *dec2;
5950
5951 dec1 = getDecodedObject(so1->obj);
5952 dec2 = getDecodedObject(so2->obj);
5953 cmp = strcoll(dec1->ptr,dec2->ptr);
5954 decrRefCount(dec1);
5955 decrRefCount(dec2);
5956 }
5957 }
5958 return server.sort_desc ? -cmp : cmp;
5959 }
5960
5961 /* The SORT command is the most complex command in Redis. Warning: this code
5962 * is optimized for speed and a bit less for readability */
5963 static void sortCommand(redisClient *c) {
5964 list *operations;
5965 int outputlen = 0;
5966 int desc = 0, alpha = 0;
5967 int limit_start = 0, limit_count = -1, start, end;
5968 int j, dontsort = 0, vectorlen;
5969 int getop = 0; /* GET operation counter */
5970 robj *sortval, *sortby = NULL, *storekey = NULL;
5971 redisSortObject *vector; /* Resulting vector to sort */
5972
5973 /* Lookup the key to sort. It must be of the right types */
5974 sortval = lookupKeyRead(c->db,c->argv[1]);
5975 if (sortval == NULL) {
5976 addReply(c,shared.nullmultibulk);
5977 return;
5978 }
5979 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5980 sortval->type != REDIS_ZSET)
5981 {
5982 addReply(c,shared.wrongtypeerr);
5983 return;
5984 }
5985
5986 /* Create a list of operations to perform for every sorted element.
5987 * Operations can be GET/DEL/INCR/DECR */
5988 operations = listCreate();
5989 listSetFreeMethod(operations,zfree);
5990 j = 2;
5991
5992 /* Now we need to protect sortval incrementing its count, in the future
5993 * SORT may have options able to overwrite/delete keys during the sorting
5994 * and the sorted key itself may get destroied */
5995 incrRefCount(sortval);
5996
5997 /* The SORT command has an SQL-alike syntax, parse it */
5998 while(j < c->argc) {
5999 int leftargs = c->argc-j-1;
6000 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6001 desc = 0;
6002 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6003 desc = 1;
6004 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6005 alpha = 1;
6006 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6007 limit_start = atoi(c->argv[j+1]->ptr);
6008 limit_count = atoi(c->argv[j+2]->ptr);
6009 j+=2;
6010 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6011 storekey = c->argv[j+1];
6012 j++;
6013 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6014 sortby = c->argv[j+1];
6015 /* If the BY pattern does not contain '*', i.e. it is constant,
6016 * we don't need to sort nor to lookup the weight keys. */
6017 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6018 j++;
6019 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6020 listAddNodeTail(operations,createSortOperation(
6021 REDIS_SORT_GET,c->argv[j+1]));
6022 getop++;
6023 j++;
6024 } else {
6025 decrRefCount(sortval);
6026 listRelease(operations);
6027 addReply(c,shared.syntaxerr);
6028 return;
6029 }
6030 j++;
6031 }
6032
6033 /* Load the sorting vector with all the objects to sort */
6034 switch(sortval->type) {
6035 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6036 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6037 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6038 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6039 }
6040 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6041 j = 0;
6042
6043 if (sortval->type == REDIS_LIST) {
6044 list *list = sortval->ptr;
6045 listNode *ln;
6046 listIter li;
6047
6048 listRewind(list,&li);
6049 while((ln = listNext(&li))) {
6050 robj *ele = ln->value;
6051 vector[j].obj = ele;
6052 vector[j].u.score = 0;
6053 vector[j].u.cmpobj = NULL;
6054 j++;
6055 }
6056 } else {
6057 dict *set;
6058 dictIterator *di;
6059 dictEntry *setele;
6060
6061 if (sortval->type == REDIS_SET) {
6062 set = sortval->ptr;
6063 } else {
6064 zset *zs = sortval->ptr;
6065 set = zs->dict;
6066 }
6067
6068 di = dictGetIterator(set);
6069 while((setele = dictNext(di)) != NULL) {
6070 vector[j].obj = dictGetEntryKey(setele);
6071 vector[j].u.score = 0;
6072 vector[j].u.cmpobj = NULL;
6073 j++;
6074 }
6075 dictReleaseIterator(di);
6076 }
6077 redisAssert(j == vectorlen);
6078
6079 /* Now it's time to load the right scores in the sorting vector */
6080 if (dontsort == 0) {
6081 for (j = 0; j < vectorlen; j++) {
6082 if (sortby) {
6083 robj *byval;
6084
6085 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6086 if (!byval || byval->type != REDIS_STRING) continue;
6087 if (alpha) {
6088 vector[j].u.cmpobj = getDecodedObject(byval);
6089 } else {
6090 if (byval->encoding == REDIS_ENCODING_RAW) {
6091 vector[j].u.score = strtod(byval->ptr,NULL);
6092 } else {
6093 /* Don't need to decode the object if it's
6094 * integer-encoded (the only encoding supported) so
6095 * far. We can just cast it */
6096 if (byval->encoding == REDIS_ENCODING_INT) {
6097 vector[j].u.score = (long)byval->ptr;
6098 } else
6099 redisAssert(1 != 1);
6100 }
6101 }
6102 } else {
6103 if (!alpha) {
6104 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6105 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6106 else {
6107 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6108 vector[j].u.score = (long) vector[j].obj->ptr;
6109 else
6110 redisAssert(1 != 1);
6111 }
6112 }
6113 }
6114 }
6115 }
6116
6117 /* We are ready to sort the vector... perform a bit of sanity check
6118 * on the LIMIT option too. We'll use a partial version of quicksort. */
6119 start = (limit_start < 0) ? 0 : limit_start;
6120 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6121 if (start >= vectorlen) {
6122 start = vectorlen-1;
6123 end = vectorlen-2;
6124 }
6125 if (end >= vectorlen) end = vectorlen-1;
6126
6127 if (dontsort == 0) {
6128 server.sort_desc = desc;
6129 server.sort_alpha = alpha;
6130 server.sort_bypattern = sortby ? 1 : 0;
6131 if (sortby && (start != 0 || end != vectorlen-1))
6132 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6133 else
6134 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6135 }
6136
6137 /* Send command output to the output buffer, performing the specified
6138 * GET/DEL/INCR/DECR operations if any. */
6139 outputlen = getop ? getop*(end-start+1) : end-start+1;
6140 if (storekey == NULL) {
6141 /* STORE option not specified, sent the sorting result to client */
6142 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6143 for (j = start; j <= end; j++) {
6144 listNode *ln;
6145 listIter li;
6146
6147 if (!getop) {
6148 addReplyBulkLen(c,vector[j].obj);
6149 addReply(c,vector[j].obj);
6150 addReply(c,shared.crlf);
6151 }
6152 listRewind(operations,&li);
6153 while((ln = listNext(&li))) {
6154 redisSortOperation *sop = ln->value;
6155 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6156 vector[j].obj);
6157
6158 if (sop->type == REDIS_SORT_GET) {
6159 if (!val || val->type != REDIS_STRING) {
6160 addReply(c,shared.nullbulk);
6161 } else {
6162 addReplyBulkLen(c,val);
6163 addReply(c,val);
6164 addReply(c,shared.crlf);
6165 }
6166 } else {
6167 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6168 }
6169 }
6170 }
6171 } else {
6172 robj *listObject = createListObject();
6173 list *listPtr = (list*) listObject->ptr;
6174
6175 /* STORE option specified, set the sorting result as a List object */
6176 for (j = start; j <= end; j++) {
6177 listNode *ln;
6178 listIter li;
6179
6180 if (!getop) {
6181 listAddNodeTail(listPtr,vector[j].obj);
6182 incrRefCount(vector[j].obj);
6183 }
6184 listRewind(operations,&li);
6185 while((ln = listNext(&li))) {
6186 redisSortOperation *sop = ln->value;
6187 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6188 vector[j].obj);
6189
6190 if (sop->type == REDIS_SORT_GET) {
6191 if (!val || val->type != REDIS_STRING) {
6192 listAddNodeTail(listPtr,createStringObject("",0));
6193 } else {
6194 listAddNodeTail(listPtr,val);
6195 incrRefCount(val);
6196 }
6197 } else {
6198 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6199 }
6200 }
6201 }
6202 if (dictReplace(c->db->dict,storekey,listObject)) {
6203 incrRefCount(storekey);
6204 }
6205 /* Note: we add 1 because the DB is dirty anyway since even if the
6206 * SORT result is empty a new key is set and maybe the old content
6207 * replaced. */
6208 server.dirty += 1+outputlen;
6209 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6210 }
6211
6212 /* Cleanup */
6213 decrRefCount(sortval);
6214 listRelease(operations);
6215 for (j = 0; j < vectorlen; j++) {
6216 if (sortby && alpha && vector[j].u.cmpobj)
6217 decrRefCount(vector[j].u.cmpobj);
6218 }
6219 zfree(vector);
6220 }
6221
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * possible output is well below 64 bytes). Values below 1024 are printed
 * as an integer number of bytes, everything else with two decimals and
 * the K, M, G or T suffix. Amounts of one terabyte or more all use the
 * T suffix, so 's' is always written whatever the value of 'n'. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Catch-all: without this branch the output buffer would be left
         * untouched (possibly uninitialized) for n >= 1 TB. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6242
6243 /* Create the string returned by the INFO command. This is decoupled
6244 * by the INFO command itself as we need to report the same information
6245 * on memory corruption problems. */
6246 static sds genRedisInfoString(void) {
6247 sds info;
6248 time_t uptime = time(NULL)-server.stat_starttime;
6249 int j;
6250 char hmem[64];
6251
6252 bytesToHuman(hmem,zmalloc_used_memory());
6253 info = sdscatprintf(sdsempty(),
6254 "redis_version:%s\r\n"
6255 "arch_bits:%s\r\n"
6256 "multiplexing_api:%s\r\n"
6257 "process_id:%ld\r\n"
6258 "uptime_in_seconds:%ld\r\n"
6259 "uptime_in_days:%ld\r\n"
6260 "connected_clients:%d\r\n"
6261 "connected_slaves:%d\r\n"
6262 "blocked_clients:%d\r\n"
6263 "used_memory:%zu\r\n"
6264 "used_memory_human:%s\r\n"
6265 "changes_since_last_save:%lld\r\n"
6266 "bgsave_in_progress:%d\r\n"
6267 "last_save_time:%ld\r\n"
6268 "bgrewriteaof_in_progress:%d\r\n"
6269 "total_connections_received:%lld\r\n"
6270 "total_commands_processed:%lld\r\n"
6271 "vm_enabled:%d\r\n"
6272 "role:%s\r\n"
6273 ,REDIS_VERSION,
6274 (sizeof(long) == 8) ? "64" : "32",
6275 aeGetApiName(),
6276 (long) getpid(),
6277 uptime,
6278 uptime/(3600*24),
6279 listLength(server.clients)-listLength(server.slaves),
6280 listLength(server.slaves),
6281 server.blpop_blocked_clients,
6282 zmalloc_used_memory(),
6283 hmem,
6284 server.dirty,
6285 server.bgsavechildpid != -1,
6286 server.lastsave,
6287 server.bgrewritechildpid != -1,
6288 server.stat_numconnections,
6289 server.stat_numcommands,
6290 server.vm_enabled != 0,
6291 server.masterhost == NULL ? "master" : "slave"
6292 );
6293 if (server.masterhost) {
6294 info = sdscatprintf(info,
6295 "master_host:%s\r\n"
6296 "master_port:%d\r\n"
6297 "master_link_status:%s\r\n"
6298 "master_last_io_seconds_ago:%d\r\n"
6299 ,server.masterhost,
6300 server.masterport,
6301 (server.replstate == REDIS_REPL_CONNECTED) ?
6302 "up" : "down",
6303 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6304 );
6305 }
6306 if (server.vm_enabled) {
6307 lockThreadedIO();
6308 info = sdscatprintf(info,
6309 "vm_conf_max_memory:%llu\r\n"
6310 "vm_conf_page_size:%llu\r\n"
6311 "vm_conf_pages:%llu\r\n"
6312 "vm_stats_used_pages:%llu\r\n"
6313 "vm_stats_swapped_objects:%llu\r\n"
6314 "vm_stats_swappin_count:%llu\r\n"
6315 "vm_stats_swappout_count:%llu\r\n"
6316 "vm_stats_io_newjobs_len:%lu\r\n"
6317 "vm_stats_io_processing_len:%lu\r\n"
6318 "vm_stats_io_processed_len:%lu\r\n"
6319 "vm_stats_io_active_threads:%lu\r\n"
6320 "vm_stats_blocked_clients:%lu\r\n"
6321 ,(unsigned long long) server.vm_max_memory,
6322 (unsigned long long) server.vm_page_size,
6323 (unsigned long long) server.vm_pages,
6324 (unsigned long long) server.vm_stats_used_pages,
6325 (unsigned long long) server.vm_stats_swapped_objects,
6326 (unsigned long long) server.vm_stats_swapins,
6327 (unsigned long long) server.vm_stats_swapouts,
6328 (unsigned long) listLength(server.io_newjobs),
6329 (unsigned long) listLength(server.io_processing),
6330 (unsigned long) listLength(server.io_processed),
6331 (unsigned long) server.io_active_threads,
6332 (unsigned long) server.vm_blocked_clients
6333 );
6334 unlockThreadedIO();
6335 }
6336 for (j = 0; j < server.dbnum; j++) {
6337 long long keys, vkeys;
6338
6339 keys = dictSize(server.db[j].dict);
6340 vkeys = dictSize(server.db[j].expires);
6341 if (keys || vkeys) {
6342 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6343 j, keys, vkeys);
6344 }
6345 }
6346 return info;
6347 }
6348
6349 static void infoCommand(redisClient *c) {
6350 sds info = genRedisInfoString();
6351 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6352 (unsigned long)sdslen(info)));
6353 addReplySds(c,info);
6354 addReply(c,shared.crlf);
6355 }
6356
6357 static void monitorCommand(redisClient *c) {
6358 /* ignore MONITOR if aleady slave or in monitor mode */
6359 if (c->flags & REDIS_SLAVE) return;
6360
6361 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6362 c->slaveseldb = 0;
6363 listAddNodeTail(server.monitors,c);
6364 addReply(c,shared.ok);
6365 }
6366
6367 /* ================================= Expire ================================= */
6368 static int removeExpire(redisDb *db, robj *key) {
6369 if (dictDelete(db->expires,key) == DICT_OK) {
6370 return 1;
6371 } else {
6372 return 0;
6373 }
6374 }
6375
6376 static int setExpire(redisDb *db, robj *key, time_t when) {
6377 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6378 return 0;
6379 } else {
6380 incrRefCount(key);
6381 return 1;
6382 }
6383 }
6384
6385 /* Return the expire time of the specified key, or -1 if no expire
6386 * is associated with this key (i.e. the key is non volatile) */
6387 static time_t getExpire(redisDb *db, robj *key) {
6388 dictEntry *de;
6389
6390 /* No expire? return ASAP */
6391 if (dictSize(db->expires) == 0 ||
6392 (de = dictFind(db->expires,key)) == NULL) return -1;
6393
6394 return (time_t) dictGetEntryVal(de);
6395 }
6396
6397 static int expireIfNeeded(redisDb *db, robj *key) {
6398 time_t when;
6399 dictEntry *de;
6400
6401 /* No expire? return ASAP */
6402 if (dictSize(db->expires) == 0 ||
6403 (de = dictFind(db->expires,key)) == NULL) return 0;
6404
6405 /* Lookup the expire */
6406 when = (time_t) dictGetEntryVal(de);
6407 if (time(NULL) <= when) return 0;
6408
6409 /* Delete the key */
6410 dictDelete(db->expires,key);
6411 return dictDelete(db->dict,key) == DICT_OK;
6412 }
6413
6414 static int deleteIfVolatile(redisDb *db, robj *key) {
6415 dictEntry *de;
6416
6417 /* No expire? return ASAP */
6418 if (dictSize(db->expires) == 0 ||
6419 (de = dictFind(db->expires,key)) == NULL) return 0;
6420
6421 /* Delete the key */
6422 server.dirty++;
6423 dictDelete(db->expires,key);
6424 return dictDelete(db->dict,key) == DICT_OK;
6425 }
6426
6427 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6428 dictEntry *de;
6429
6430 de = dictFind(c->db->dict,key);
6431 if (de == NULL) {
6432 addReply(c,shared.czero);
6433 return;
6434 }
6435 if (seconds < 0) {
6436 if (deleteKey(c->db,key)) server.dirty++;
6437 addReply(c, shared.cone);
6438 return;
6439 } else {
6440 time_t when = time(NULL)+seconds;
6441 if (setExpire(c->db,key,when)) {
6442 addReply(c,shared.cone);
6443 server.dirty++;
6444 } else {
6445 addReply(c,shared.czero);
6446 }
6447 return;
6448 }
6449 }
6450
6451 static void expireCommand(redisClient *c) {
6452 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6453 }
6454
6455 static void expireatCommand(redisClient *c) {
6456 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6457 }
6458
6459 static void ttlCommand(redisClient *c) {
6460 time_t expire;
6461 int ttl = -1;
6462
6463 expire = getExpire(c->db,c->argv[1]);
6464 if (expire != -1) {
6465 ttl = (int) (expire-time(NULL));
6466 if (ttl < 0) ttl = -1;
6467 }
6468 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6469 }
6470
6471 /* ================================ MULTI/EXEC ============================== */
6472
6473 /* Client state initialization for MULTI/EXEC */
6474 static void initClientMultiState(redisClient *c) {
6475 c->mstate.commands = NULL;
6476 c->mstate.count = 0;
6477 }
6478
6479 /* Release all the resources associated with MULTI/EXEC state */
6480 static void freeClientMultiState(redisClient *c) {
6481 int j;
6482
6483 for (j = 0; j < c->mstate.count; j++) {
6484 int i;
6485 multiCmd *mc = c->mstate.commands+j;
6486
6487 for (i = 0; i < mc->argc; i++)
6488 decrRefCount(mc->argv[i]);
6489 zfree(mc->argv);
6490 }
6491 zfree(c->mstate.commands);
6492 }
6493
6494 /* Add a new command into the MULTI commands queue */
6495 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6496 multiCmd *mc;
6497 int j;
6498
6499 c->mstate.commands = zrealloc(c->mstate.commands,
6500 sizeof(multiCmd)*(c->mstate.count+1));
6501 mc = c->mstate.commands+c->mstate.count;
6502 mc->cmd = cmd;
6503 mc->argc = c->argc;
6504 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6505 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6506 for (j = 0; j < c->argc; j++)
6507 incrRefCount(mc->argv[j]);
6508 c->mstate.count++;
6509 }
6510
6511 static void multiCommand(redisClient *c) {
6512 c->flags |= REDIS_MULTI;
6513 addReply(c,shared.ok);
6514 }
6515
6516 static void discardCommand(redisClient *c) {
6517 if (!(c->flags & REDIS_MULTI)) {
6518 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6519 return;
6520 }
6521
6522 freeClientMultiState(c);
6523 initClientMultiState(c);
6524 c->flags &= (~REDIS_MULTI);
6525 addReply(c,shared.ok);
6526 }
6527
6528 static void execCommand(redisClient *c) {
6529 int j;
6530 robj **orig_argv;
6531 int orig_argc;
6532
6533 if (!(c->flags & REDIS_MULTI)) {
6534 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6535 return;
6536 }
6537
6538 orig_argv = c->argv;
6539 orig_argc = c->argc;
6540 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6541 for (j = 0; j < c->mstate.count; j++) {
6542 c->argc = c->mstate.commands[j].argc;
6543 c->argv = c->mstate.commands[j].argv;
6544 call(c,c->mstate.commands[j].cmd);
6545 }
6546 c->argv = orig_argv;
6547 c->argc = orig_argc;
6548 freeClientMultiState(c);
6549 initClientMultiState(c);
6550 c->flags &= (~REDIS_MULTI);
6551 }
6552
6553 /* =========================== Blocking Operations ========================= */
6554
6555 /* Currently Redis blocking operations support is limited to list POP ops,
6556 * so the current implementation is not fully generic, but it is also not
6557 * completely specific so it will not require a rewrite to support new
6558 * kind of blocking operations in the future.
6559 *
6560 * Still it's important to note that list blocking operations can be already
6561 * used as a notification mechanism in order to implement other blocking
6562 * operations at application level, so there must be a very strong evidence
6563 * of usefulness and generality before new blocking operations are implemented.
6564 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
6579 *
6580 * The above comment and the source code should be enough in order to understand
6581 * the implementation and modify / fix it later.
6582 */
6583
6584 /* Set a client in blocking mode for the specified key, with the specified
6585 * timeout */
6586 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6587 dictEntry *de;
6588 list *l;
6589 int j;
6590
6591 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6592 c->blockingkeysnum = numkeys;
6593 c->blockingto = timeout;
6594 for (j = 0; j < numkeys; j++) {
6595 /* Add the key in the client structure, to map clients -> keys */
6596 c->blockingkeys[j] = keys[j];
6597 incrRefCount(keys[j]);
6598
6599 /* And in the other "side", to map keys -> clients */
6600 de = dictFind(c->db->blockingkeys,keys[j]);
6601 if (de == NULL) {
6602 int retval;
6603
6604 /* For every key we take a list of clients blocked for it */
6605 l = listCreate();
6606 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6607 incrRefCount(keys[j]);
6608 assert(retval == DICT_OK);
6609 } else {
6610 l = dictGetEntryVal(de);
6611 }
6612 listAddNodeTail(l,c);
6613 }
6614 /* Mark the client as a blocked client */
6615 c->flags |= REDIS_BLOCKED;
6616 server.blpop_blocked_clients++;
6617 }
6618
6619 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6620 static void unblockClientWaitingData(redisClient *c) {
6621 dictEntry *de;
6622 list *l;
6623 int j;
6624
6625 assert(c->blockingkeys != NULL);
6626 /* The client may wait for multiple keys, so unblock it for every key. */
6627 for (j = 0; j < c->blockingkeysnum; j++) {
6628 /* Remove this client from the list of clients waiting for this key. */
6629 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6630 assert(de != NULL);
6631 l = dictGetEntryVal(de);
6632 listDelNode(l,listSearchKey(l,c));
6633 /* If the list is empty we need to remove it to avoid wasting memory */
6634 if (listLength(l) == 0)
6635 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6636 decrRefCount(c->blockingkeys[j]);
6637 }
6638 /* Cleanup the client structure */
6639 zfree(c->blockingkeys);
6640 c->blockingkeys = NULL;
6641 c->flags &= (~REDIS_BLOCKED);
6642 server.blpop_blocked_clients--;
6643 /* We want to process data if there is some command waiting
6644 * in the input buffer. Note that this is safe even if
6645 * unblockClientWaitingData() gets called from freeClient() because
6646 * freeClient() will be smart enough to call this function
6647 * *after* c->querybuf was set to NULL. */
6648 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6649 }
6650
6651 /* This should be called from any function PUSHing into lists.
6652 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6653 * 'ele' is the element pushed.
6654 *
6655 * If the function returns 0 there was no client waiting for a list push
6656 * against this key.
6657 *
6658 * If the function returns 1 there was a client waiting for a list push
6659 * against this key, the element was passed to this client thus it's not
6660 * needed to actually add it to the list and the caller should return asap. */
6661 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6662 struct dictEntry *de;
6663 redisClient *receiver;
6664 list *l;
6665 listNode *ln;
6666
6667 de = dictFind(c->db->blockingkeys,key);
6668 if (de == NULL) return 0;
6669 l = dictGetEntryVal(de);
6670 ln = listFirst(l);
6671 assert(ln != NULL);
6672 receiver = ln->value;
6673
6674 addReplySds(receiver,sdsnew("*2\r\n"));
6675 addReplyBulkLen(receiver,key);
6676 addReply(receiver,key);
6677 addReply(receiver,shared.crlf);
6678 addReplyBulkLen(receiver,ele);
6679 addReply(receiver,ele);
6680 addReply(receiver,shared.crlf);
6681 unblockClientWaitingData(receiver);
6682 return 1;
6683 }
6684
6685 /* Blocking RPOP/LPOP */
6686 static void blockingPopGenericCommand(redisClient *c, int where) {
6687 robj *o;
6688 time_t timeout;
6689 int j;
6690
6691 for (j = 1; j < c->argc-1; j++) {
6692 o = lookupKeyWrite(c->db,c->argv[j]);
6693 if (o != NULL) {
6694 if (o->type != REDIS_LIST) {
6695 addReply(c,shared.wrongtypeerr);
6696 return;
6697 } else {
6698 list *list = o->ptr;
6699 if (listLength(list) != 0) {
6700 /* If the list contains elements fall back to the usual
6701 * non-blocking POP operation */
6702 robj *argv[2], **orig_argv;
6703 int orig_argc;
6704
6705 /* We need to alter the command arguments before to call
6706 * popGenericCommand() as the command takes a single key. */
6707 orig_argv = c->argv;
6708 orig_argc = c->argc;
6709 argv[1] = c->argv[j];
6710 c->argv = argv;
6711 c->argc = 2;
6712
6713 /* Also the return value is different, we need to output
6714 * the multi bulk reply header and the key name. The
6715 * "real" command will add the last element (the value)
6716 * for us. If this souds like an hack to you it's just
6717 * because it is... */
6718 addReplySds(c,sdsnew("*2\r\n"));
6719 addReplyBulkLen(c,argv[1]);
6720 addReply(c,argv[1]);
6721 addReply(c,shared.crlf);
6722 popGenericCommand(c,where);
6723
6724 /* Fix the client structure with the original stuff */
6725 c->argv = orig_argv;
6726 c->argc = orig_argc;
6727 return;
6728 }
6729 }
6730 }
6731 }
6732 /* If the list is empty or the key does not exists we must block */
6733 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
6734 if (timeout > 0) timeout += time(NULL);
6735 blockForKeys(c,c->argv+1,c->argc-2,timeout);
6736 }
6737
6738 static void blpopCommand(redisClient *c) {
6739 blockingPopGenericCommand(c,REDIS_HEAD);
6740 }
6741
6742 static void brpopCommand(redisClient *c) {
6743 blockingPopGenericCommand(c,REDIS_TAIL);
6744 }
6745
6746 /* =============================== Replication ============================= */
6747
6748 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
6749 ssize_t nwritten, ret = size;
6750 time_t start = time(NULL);
6751
6752 timeout++;
6753 while(size) {
6754 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6755 nwritten = write(fd,ptr,size);
6756 if (nwritten == -1) return -1;
6757 ptr += nwritten;
6758 size -= nwritten;
6759 }
6760 if ((time(NULL)-start) > timeout) {
6761 errno = ETIMEDOUT;
6762 return -1;
6763 }
6764 }
6765 return ret;
6766 }
6767
6768 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
6769 ssize_t nread, totread = 0;
6770 time_t start = time(NULL);
6771
6772 timeout++;
6773 while(size) {
6774 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6775 nread = read(fd,ptr,size);
6776 if (nread == -1) return -1;
6777 ptr += nread;
6778 size -= nread;
6779 totread += nread;
6780 }
6781 if ((time(NULL)-start) > timeout) {
6782 errno = ETIMEDOUT;
6783 return -1;
6784 }
6785 }
6786 return totread;
6787 }
6788
/* Read a line from 'fd', one byte at a time, into the buffer 'ptr' of
 * 'size' bytes. Reading stops at the first '\n' or when the buffer is
 * full (at most size-1 characters are stored, the result is always
 * null terminated).
 *
 * The newline is not stored; a '\r' just before it is replaced by the
 * terminator but is still counted in the return value.
 *
 * Returns the number of characters stored, or -1 if 'size' is not
 * positive or if syncRead() fails ('timeout' is passed to every
 * single-byte read). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return -1;
    *ptr = '\0';

    size--; /* Reserve room for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte of the buffer consumed. Without this decrement the
         * loop never ends on long lines and writes past the buffer. */
        size--;
    }
    return nread;
}
6809
6810 static void syncCommand(redisClient *c) {
6811 /* ignore SYNC if aleady slave or in monitor mode */
6812 if (c->flags & REDIS_SLAVE) return;
6813
6814 /* SYNC can't be issued when the server has pending data to send to
6815 * the client about already issued commands. We need a fresh reply
6816 * buffer registering the differences between the BGSAVE and the current
6817 * dataset, so that we can copy to other slaves if needed. */
6818 if (listLength(c->reply) != 0) {
6819 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6820 return;
6821 }
6822
6823 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6824 /* Here we need to check if there is a background saving operation
6825 * in progress, or if it is required to start one */
6826 if (server.bgsavechildpid != -1) {
6827 /* Ok a background save is in progress. Let's check if it is a good
6828 * one for replication, i.e. if there is another slave that is
6829 * registering differences since the server forked to save */
6830 redisClient *slave;
6831 listNode *ln;
6832 listIter li;
6833
6834 listRewind(server.slaves,&li);
6835 while((ln = listNext(&li))) {
6836 slave = ln->value;
6837 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
6838 }
6839 if (ln) {
6840 /* Perfect, the server is already registering differences for
6841 * another slave. Set the right state, and copy the buffer. */
6842 listRelease(c->reply);
6843 c->reply = listDup(slave->reply);
6844 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6845 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6846 } else {
6847 /* No way, we need to wait for the next BGSAVE in order to
6848 * register differences */
6849 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6850 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6851 }
6852 } else {
6853 /* Ok we don't have a BGSAVE in progress, let's start one */
6854 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6855 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6856 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6857 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6858 return;
6859 }
6860 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6861 }
6862 c->repldbfd = -1;
6863 c->flags |= REDIS_SLAVE;
6864 c->slaveseldb = 0;
6865 listAddNodeTail(server.slaves,c);
6866 return;
6867 }
6868
6869 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6870 redisClient *slave = privdata;
6871 REDIS_NOTUSED(el);
6872 REDIS_NOTUSED(mask);
6873 char buf[REDIS_IOBUF_LEN];
6874 ssize_t nwritten, buflen;
6875
6876 if (slave->repldboff == 0) {
6877 /* Write the bulk write count before to transfer the DB. In theory here
6878 * we don't know how much room there is in the output buffer of the
6879 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6880 * operations) will never be smaller than the few bytes we need. */
6881 sds bulkcount;
6882
6883 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6884 slave->repldbsize);
6885 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6886 {
6887 sdsfree(bulkcount);
6888 freeClient(slave);
6889 return;
6890 }
6891 sdsfree(bulkcount);
6892 }
6893 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6894 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6895 if (buflen <= 0) {
6896 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6897 (buflen == 0) ? "premature EOF" : strerror(errno));
6898 freeClient(slave);
6899 return;
6900 }
6901 if ((nwritten = write(fd,buf,buflen)) == -1) {
6902 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6903 strerror(errno));
6904 freeClient(slave);
6905 return;
6906 }
6907 slave->repldboff += nwritten;
6908 if (slave->repldboff == slave->repldbsize) {
6909 close(slave->repldbfd);
6910 slave->repldbfd = -1;
6911 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6912 slave->replstate = REDIS_REPL_ONLINE;
6913 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
6914 sendReplyToClient, slave) == AE_ERR) {
6915 freeClient(slave);
6916 return;
6917 }
6918 addReplySds(slave,sdsempty());
6919 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6920 }
6921 }
6922
6923 /* This function is called at the end of every backgrond saving.
6924 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6925 * otherwise REDIS_ERR is passed to the function.
6926 *
6927 * The goal of this function is to handle slaves waiting for a successful
6928 * background saving in order to perform non-blocking synchronization. */
6929 static void updateSlavesWaitingBgsave(int bgsaveerr) {
6930 listNode *ln;
6931 int startbgsave = 0;
6932 listIter li;
6933
6934 listRewind(server.slaves,&li);
6935 while((ln = listNext(&li))) {
6936 redisClient *slave = ln->value;
6937
6938 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6939 startbgsave = 1;
6940 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6941 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
6942 struct redis_stat buf;
6943
6944 if (bgsaveerr != REDIS_OK) {
6945 freeClient(slave);
6946 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6947 continue;
6948 }
6949 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
6950 redis_fstat(slave->repldbfd,&buf) == -1) {
6951 freeClient(slave);
6952 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6953 continue;
6954 }
6955 slave->repldboff = 0;
6956 slave->repldbsize = buf.st_size;
6957 slave->replstate = REDIS_REPL_SEND_BULK;
6958 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6959 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6960 freeClient(slave);
6961 continue;
6962 }
6963 }
6964 }
6965 if (startbgsave) {
6966 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6967 listIter li;
6968
6969 listRewind(server.slaves,&li);
6970 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
6971 while((ln = listNext(&li))) {
6972 redisClient *slave = ln->value;
6973
6974 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6975 freeClient(slave);
6976 }
6977 }
6978 }
6979 }
6980
6981 static int syncWithMaster(void) {
6982 char buf[1024], tmpfile[256], authcmd[1024];
6983 long dumpsize;
6984 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6985 int dfd;
6986
6987 if (fd == -1) {
6988 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6989 strerror(errno));
6990 return REDIS_ERR;
6991 }
6992
6993 /* AUTH with the master if required. */
6994 if(server.masterauth) {
6995 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6996 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6997 close(fd);
6998 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6999 strerror(errno));
7000 return REDIS_ERR;
7001 }
7002 /* Read the AUTH result. */
7003 if (syncReadLine(fd,buf,1024,3600) == -1) {
7004 close(fd);
7005 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7006 strerror(errno));
7007 return REDIS_ERR;
7008 }
7009 if (buf[0] != '+') {
7010 close(fd);
7011 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7012 return REDIS_ERR;
7013 }
7014 }
7015
7016 /* Issue the SYNC command */
7017 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7018 close(fd);
7019 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7020 strerror(errno));
7021 return REDIS_ERR;
7022 }
7023 /* Read the bulk write count */
7024 if (syncReadLine(fd,buf,1024,3600) == -1) {
7025 close(fd);
7026 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7027 strerror(errno));
7028 return REDIS_ERR;
7029 }
7030 if (buf[0] != '$') {
7031 close(fd);
7032 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7033 return REDIS_ERR;
7034 }
7035 dumpsize = strtol(buf+1,NULL,10);
7036 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7037 /* Read the bulk write data on a temp file */
7038 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
7039 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
7040 if (dfd == -1) {
7041 close(fd);
7042 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7043 return REDIS_ERR;
7044 }
7045 while(dumpsize) {
7046 int nread, nwritten;
7047
7048 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7049 if (nread == -1) {
7050 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7051 strerror(errno));
7052 close(fd);
7053 close(dfd);
7054 return REDIS_ERR;
7055 }
7056 nwritten = write(dfd,buf,nread);
7057 if (nwritten == -1) {
7058 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7059 close(fd);
7060 close(dfd);
7061 return REDIS_ERR;
7062 }
7063 dumpsize -= nread;
7064 }
7065 close(dfd);
7066 if (rename(tmpfile,server.dbfilename) == -1) {
7067 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7068 unlink(tmpfile);
7069 close(fd);
7070 return REDIS_ERR;
7071 }
7072 emptyDb();
7073 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7074 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7075 close(fd);
7076 return REDIS_ERR;
7077 }
7078 server.master = createClient(fd);
7079 server.master->flags |= REDIS_MASTER;
7080 server.master->authenticated = 1;
7081 server.replstate = REDIS_REPL_CONNECTED;
7082 return REDIS_OK;
7083 }
7084
7085 static void slaveofCommand(redisClient *c) {
7086 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7087 !strcasecmp(c->argv[2]->ptr,"one")) {
7088 if (server.masterhost) {
7089 sdsfree(server.masterhost);
7090 server.masterhost = NULL;
7091 if (server.master) freeClient(server.master);
7092 server.replstate = REDIS_REPL_NONE;
7093 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7094 }
7095 } else {
7096 sdsfree(server.masterhost);
7097 server.masterhost = sdsdup(c->argv[1]->ptr);
7098 server.masterport = atoi(c->argv[2]->ptr);
7099 if (server.master) freeClient(server.master);
7100 server.replstate = REDIS_REPL_CONNECT;
7101 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7102 server.masterhost, server.masterport);
7103 }
7104 addReply(c,shared.ok);
7105 }
7106
7107 /* ============================ Maxmemory directive ======================== */
7108
7109 /* Try to free one object form the pre-allocated objects free list.
7110 * This is useful under low mem conditions as by default we take 1 million
7111 * free objects allocated. On success REDIS_OK is returned, otherwise
7112 * REDIS_ERR. */
7113 static int tryFreeOneObjectFromFreelist(void) {
7114 robj *o;
7115
7116 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7117 if (listLength(server.objfreelist)) {
7118 listNode *head = listFirst(server.objfreelist);
7119 o = listNodeValue(head);
7120 listDelNode(server.objfreelist,head);
7121 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7122 zfree(o);
7123 return REDIS_OK;
7124 } else {
7125 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7126 return REDIS_ERR;
7127 }
7128 }
7129
7130 /* This function gets called when 'maxmemory' is set on the config file to limit
7131 * the max memory used by the server, and we are out of memory.
7132 * This function will try to, in order:
7133 *
7134 * - Free objects from the free list
7135 * - Try to remove keys with an EXPIRE set
7136 *
7137 * It is not possible to free enough memory to reach used-memory < maxmemory
7138 * the server will start refusing commands that will enlarge even more the
7139 * memory usage.
7140 */
7141 static void freeMemoryIfNeeded(void) {
7142 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7143 int j, k, freed = 0;
7144
7145 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7146 for (j = 0; j < server.dbnum; j++) {
7147 int minttl = -1;
7148 robj *minkey = NULL;
7149 struct dictEntry *de;
7150
7151 if (dictSize(server.db[j].expires)) {
7152 freed = 1;
7153 /* From a sample of three keys drop the one nearest to
7154 * the natural expire */
7155 for (k = 0; k < 3; k++) {
7156 time_t t;
7157
7158 de = dictGetRandomKey(server.db[j].expires);
7159 t = (time_t) dictGetEntryVal(de);
7160 if (minttl == -1 || t < minttl) {
7161 minkey = dictGetEntryKey(de);
7162 minttl = t;
7163 }
7164 }
7165 deleteKey(server.db+j,minkey);
7166 }
7167 }
7168 if (!freed) return; /* nothing to free... */
7169 }
7170 }
7171
7172 /* ============================== Append Only file ========================== */
7173
7174 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7175 sds buf = sdsempty();
7176 int j;
7177 ssize_t nwritten;
7178 time_t now;
7179 robj *tmpargv[3];
7180
7181 /* The DB this command was targetting is not the same as the last command
7182 * we appendend. To issue a SELECT command is needed. */
7183 if (dictid != server.appendseldb) {
7184 char seldb[64];
7185
7186 snprintf(seldb,sizeof(seldb),"%d",dictid);
7187 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7188 (unsigned long)strlen(seldb),seldb);
7189 server.appendseldb = dictid;
7190 }
7191
7192 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7193 * EXPIREs into EXPIREATs calls */
7194 if (cmd->proc == expireCommand) {
7195 long when;
7196
7197 tmpargv[0] = createStringObject("EXPIREAT",8);
7198 tmpargv[1] = argv[1];
7199 incrRefCount(argv[1]);
7200 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7201 tmpargv[2] = createObject(REDIS_STRING,
7202 sdscatprintf(sdsempty(),"%ld",when));
7203 argv = tmpargv;
7204 }
7205
7206 /* Append the actual command */
7207 buf = sdscatprintf(buf,"*%d\r\n",argc);
7208 for (j = 0; j < argc; j++) {
7209 robj *o = argv[j];
7210
7211 o = getDecodedObject(o);
7212 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7213 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7214 buf = sdscatlen(buf,"\r\n",2);
7215 decrRefCount(o);
7216 }
7217
7218 /* Free the objects from the modified argv for EXPIREAT */
7219 if (cmd->proc == expireCommand) {
7220 for (j = 0; j < 3; j++)
7221 decrRefCount(argv[j]);
7222 }
7223
7224 /* We want to perform a single write. This should be guaranteed atomic
7225 * at least if the filesystem we are writing is a real physical one.
7226 * While this will save us against the server being killed I don't think
7227 * there is much to do about the whole server stopping for power problems
7228 * or alike */
7229 nwritten = write(server.appendfd,buf,sdslen(buf));
7230 if (nwritten != (signed)sdslen(buf)) {
7231 /* Ooops, we are in troubles. The best thing to do for now is
7232 * to simply exit instead to give the illusion that everything is
7233 * working as expected. */
7234 if (nwritten == -1) {
7235 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7236 } else {
7237 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7238 }
7239 exit(1);
7240 }
7241 /* If a background append only file rewriting is in progress we want to
7242 * accumulate the differences between the child DB and the current one
7243 * in a buffer, so that when the child process will do its work we
7244 * can append the differences to the new append only file. */
7245 if (server.bgrewritechildpid != -1)
7246 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7247
7248 sdsfree(buf);
7249 now = time(NULL);
7250 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7251 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7252 now-server.lastfsync > 1))
7253 {
7254 fsync(server.appendfd); /* Let's try to get this data on the disk */
7255 server.lastfsync = now;
7256 }
7257 }
7258
7259 /* In Redis commands are always executed in the context of a client, so in
7260 * order to load the append only file we need to create a fake client. */
7261 static struct redisClient *createFakeClient(void) {
7262 struct redisClient *c = zmalloc(sizeof(*c));
7263
7264 selectDb(c,0);
7265 c->fd = -1;
7266 c->querybuf = sdsempty();
7267 c->argc = 0;
7268 c->argv = NULL;
7269 c->flags = 0;
7270 /* We set the fake client as a slave waiting for the synchronization
7271 * so that Redis will not try to send replies to this client. */
7272 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7273 c->reply = listCreate();
7274 listSetFreeMethod(c->reply,decrRefCount);
7275 listSetDupMethod(c->reply,dupClientReplyValue);
7276 return c;
7277 }
7278
7279 static void freeFakeClient(struct redisClient *c) {
7280 sdsfree(c->querybuf);
7281 listRelease(c->reply);
7282 zfree(c);
7283 }
7284
7285 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7286 * error (the append only file is zero-length) REDIS_ERR is returned. On
7287 * fatal error an error message is logged and the program exists. */
7288 int loadAppendOnlyFile(char *filename) {
7289 struct redisClient *fakeClient;
7290 FILE *fp = fopen(filename,"r");
7291 struct redis_stat sb;
7292 unsigned long long loadedkeys = 0;
7293
7294 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7295 return REDIS_ERR;
7296
7297 if (fp == NULL) {
7298 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7299 exit(1);
7300 }
7301
7302 fakeClient = createFakeClient();
7303 while(1) {
7304 int argc, j;
7305 unsigned long len;
7306 robj **argv;
7307 char buf[128];
7308 sds argsds;
7309 struct redisCommand *cmd;
7310
7311 if (fgets(buf,sizeof(buf),fp) == NULL) {
7312 if (feof(fp))
7313 break;
7314 else
7315 goto readerr;
7316 }
7317 if (buf[0] != '*') goto fmterr;
7318 argc = atoi(buf+1);
7319 argv = zmalloc(sizeof(robj*)*argc);
7320 for (j = 0; j < argc; j++) {
7321 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7322 if (buf[0] != '$') goto fmterr;
7323 len = strtol(buf+1,NULL,10);
7324 argsds = sdsnewlen(NULL,len);
7325 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7326 argv[j] = createObject(REDIS_STRING,argsds);
7327 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7328 }
7329
7330 /* Command lookup */
7331 cmd = lookupCommand(argv[0]->ptr);
7332 if (!cmd) {
7333 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7334 exit(1);
7335 }
7336 /* Try object sharing and encoding */
7337 if (server.shareobjects) {
7338 int j;
7339 for(j = 1; j < argc; j++)
7340 argv[j] = tryObjectSharing(argv[j]);
7341 }
7342 if (cmd->flags & REDIS_CMD_BULK)
7343 tryObjectEncoding(argv[argc-1]);
7344 /* Run the command in the context of a fake client */
7345 fakeClient->argc = argc;
7346 fakeClient->argv = argv;
7347 cmd->proc(fakeClient);
7348 /* Discard the reply objects list from the fake client */
7349 while(listLength(fakeClient->reply))
7350 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7351 /* Clean up, ready for the next command */
7352 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7353 zfree(argv);
7354 /* Handle swapping while loading big datasets when VM is on */
7355 loadedkeys++;
7356 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7357 while (zmalloc_used_memory() > server.vm_max_memory) {
7358 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7359 }
7360 }
7361 }
7362 fclose(fp);
7363 freeFakeClient(fakeClient);
7364 return REDIS_OK;
7365
7366 readerr:
7367 if (feof(fp)) {
7368 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7369 } else {
7370 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7371 }
7372 exit(1);
7373 fmterr:
7374 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7375 exit(1);
7376 }
7377
7378 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7379 static int fwriteBulk(FILE *fp, robj *obj) {
7380 char buf[128];
7381 int decrrc = 0;
7382
7383 /* Avoid the incr/decr ref count business if possible to help
7384 * copy-on-write (we are often in a child process when this function
7385 * is called).
7386 * Also makes sure that key objects don't get incrRefCount-ed when VM
7387 * is enabled */
7388 if (obj->encoding != REDIS_ENCODING_RAW) {
7389 obj = getDecodedObject(obj);
7390 decrrc = 1;
7391 }
7392 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7393 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7394 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7395 goto err;
7396 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7397 if (decrrc) decrRefCount(obj);
7398 return 1;
7399 err:
7400 if (decrrc) decrRefCount(obj);
7401 return 0;
7402 }
7403
/* Write the double 'd', formatted with "%.17g", to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that must not
     * be accounted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
7414
/* Write the long 'l', in decimal, to 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that must not
     * be accounted in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
7425
7426 /* Write a sequence of commands able to fully rebuild the dataset into
7427 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7428 static int rewriteAppendOnlyFile(char *filename) {
7429 dictIterator *di = NULL;
7430 dictEntry *de;
7431 FILE *fp;
7432 char tmpfile[256];
7433 int j;
7434 time_t now = time(NULL);
7435
7436 /* Note that we have to use a different temp name here compared to the
7437 * one used by rewriteAppendOnlyFileBackground() function. */
7438 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7439 fp = fopen(tmpfile,"w");
7440 if (!fp) {
7441 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7442 return REDIS_ERR;
7443 }
7444 for (j = 0; j < server.dbnum; j++) {
7445 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7446 redisDb *db = server.db+j;
7447 dict *d = db->dict;
7448 if (dictSize(d) == 0) continue;
7449 di = dictGetIterator(d);
7450 if (!di) {
7451 fclose(fp);
7452 return REDIS_ERR;
7453 }
7454
7455 /* SELECT the new DB */
7456 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7457 if (fwriteBulkLong(fp,j) == 0) goto werr;
7458
7459 /* Iterate this DB writing every entry */
7460 while((de = dictNext(di)) != NULL) {
7461 robj *key, *o;
7462 time_t expiretime;
7463 int swapped;
7464
7465 key = dictGetEntryKey(de);
7466 /* If the value for this key is swapped, load a preview in memory.
7467 * We use a "swapped" flag to remember if we need to free the
7468 * value object instead to just increment the ref count anyway
7469 * in order to avoid copy-on-write of pages if we are forked() */
7470 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7471 key->storage == REDIS_VM_SWAPPING) {
7472 o = dictGetEntryVal(de);
7473 swapped = 0;
7474 } else {
7475 o = vmPreviewObject(key);
7476 swapped = 1;
7477 }
7478 expiretime = getExpire(db,key);
7479
7480 /* Save the key and associated value */
7481 if (o->type == REDIS_STRING) {
7482 /* Emit a SET command */
7483 char cmd[]="*3\r\n$3\r\nSET\r\n";
7484 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7485 /* Key and value */
7486 if (fwriteBulk(fp,key) == 0) goto werr;
7487 if (fwriteBulk(fp,o) == 0) goto werr;
7488 } else if (o->type == REDIS_LIST) {
7489 /* Emit the RPUSHes needed to rebuild the list */
7490 list *list = o->ptr;
7491 listNode *ln;
7492 listIter li;
7493
7494 listRewind(list,&li);
7495 while((ln = listNext(&li))) {
7496 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7497 robj *eleobj = listNodeValue(ln);
7498
7499 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7500 if (fwriteBulk(fp,key) == 0) goto werr;
7501 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7502 }
7503 } else if (o->type == REDIS_SET) {
7504 /* Emit the SADDs needed to rebuild the set */
7505 dict *set = o->ptr;
7506 dictIterator *di = dictGetIterator(set);
7507 dictEntry *de;
7508
7509 while((de = dictNext(di)) != NULL) {
7510 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7511 robj *eleobj = dictGetEntryKey(de);
7512
7513 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7514 if (fwriteBulk(fp,key) == 0) goto werr;
7515 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7516 }
7517 dictReleaseIterator(di);
7518 } else if (o->type == REDIS_ZSET) {
7519 /* Emit the ZADDs needed to rebuild the sorted set */
7520 zset *zs = o->ptr;
7521 dictIterator *di = dictGetIterator(zs->dict);
7522 dictEntry *de;
7523
7524 while((de = dictNext(di)) != NULL) {
7525 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7526 robj *eleobj = dictGetEntryKey(de);
7527 double *score = dictGetEntryVal(de);
7528
7529 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7530 if (fwriteBulk(fp,key) == 0) goto werr;
7531 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7532 if (fwriteBulk(fp,eleobj) == 0) goto werr;
7533 }
7534 dictReleaseIterator(di);
7535 } else {
7536 redisAssert(0 != 0);
7537 }
7538 /* Save the expire time */
7539 if (expiretime != -1) {
7540 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7541 /* If this key is already expired skip it */
7542 if (expiretime < now) continue;
7543 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7544 if (fwriteBulk(fp,key) == 0) goto werr;
7545 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7546 }
7547 if (swapped) decrRefCount(o);
7548 }
7549 dictReleaseIterator(di);
7550 }
7551
7552 /* Make sure data will not remain on the OS's output buffers */
7553 fflush(fp);
7554 fsync(fileno(fp));
7555 fclose(fp);
7556
7557 /* Use RENAME to make sure the DB file is changed atomically only
7558 * if the generate DB file is ok. */
7559 if (rename(tmpfile,filename) == -1) {
7560 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7561 unlink(tmpfile);
7562 return REDIS_ERR;
7563 }
7564 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7565 return REDIS_OK;
7566
7567 werr:
7568 fclose(fp);
7569 unlink(tmpfile);
7570 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7571 if (di) dictReleaseIterator(di);
7572 return REDIS_ERR;
7573 }
7574
7575 /* This is how rewriting of the append only file in background works:
7576 *
7577 * 1) The user calls BGREWRITEAOF
7578 * 2) Redis calls this function, that forks():
7579 * 2a) the child rewrite the append only file in a temp file.
7580 * 2b) the parent accumulates differences in server.bgrewritebuf.
7581 * 3) When the child finished '2a' exists.
7582 * 4) The parent will trap the exit code, if it's OK, will append the
7583 * data accumulated into server.bgrewritebuf into the temp file, and
7584 * finally will rename(2) the temp file in the actual file name.
7585 * The the new file is reopened as the new append only file. Profit!
7586 */
7587 static int rewriteAppendOnlyFileBackground(void) {
7588 pid_t childpid;
7589
7590 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7591 if (server.vm_enabled) waitEmptyIOJobsQueue();
7592 if ((childpid = fork()) == 0) {
7593 /* Child */
7594 char tmpfile[256];
7595
7596 if (server.vm_enabled) vmReopenSwapFile();
7597 close(server.fd);
7598 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7599 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7600 _exit(0);
7601 } else {
7602 _exit(1);
7603 }
7604 } else {
7605 /* Parent */
7606 if (childpid == -1) {
7607 redisLog(REDIS_WARNING,
7608 "Can't rewrite append only file in background: fork: %s",
7609 strerror(errno));
7610 return REDIS_ERR;
7611 }
7612 redisLog(REDIS_NOTICE,
7613 "Background append only file rewriting started by pid %d",childpid);
7614 server.bgrewritechildpid = childpid;
7615 /* We set appendseldb to -1 in order to force the next call to the
7616 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7617 * accumulated by the parent into server.bgrewritebuf will start
7618 * with a SELECT statement and it will be safe to merge. */
7619 server.appendseldb = -1;
7620 return REDIS_OK;
7621 }
7622 return REDIS_OK; /* unreached */
7623 }
7624
7625 static void bgrewriteaofCommand(redisClient *c) {
7626 if (server.bgrewritechildpid != -1) {
7627 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7628 return;
7629 }
7630 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7631 char *status = "+Background append only file rewriting started\r\n";
7632 addReplySds(c,sdsnew(status));
7633 } else {
7634 addReply(c,shared.err);
7635 }
7636 }
7637
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" associated with the
 * process id 'childpid'. The result of unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7644
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
7665
7666 /* =================== Virtual Memory - Blocking Side ====================== */
7667
7668 /* substitute the first occurrence of '%p' with the process pid in the
7669 * swap file name. */
7670 static void expandVmSwapFilename(void) {
7671 char *p = strstr(server.vm_swap_file,"%p");
7672 sds new;
7673
7674 if (!p) return;
7675 new = sdsempty();
7676 *p = '\0';
7677 new = sdscat(new,server.vm_swap_file);
7678 new = sdscatprintf(new,"%ld",(long) getpid());
7679 new = sdscat(new,p+2);
7680 zfree(server.vm_swap_file);
7681 server.vm_swap_file = new;
7682 }
7683
7684 static void vmInit(void) {
7685 off_t totsize;
7686 int pipefds[2];
7687 size_t stacksize;
7688
7689 if (server.vm_max_threads != 0)
7690 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7691
7692 expandVmSwapFilename();
7693 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
7694 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7695 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7696 }
7697 if (server.vm_fp == NULL) {
7698 redisLog(REDIS_WARNING,
7699 "Impossible to open the swap file: %s. Exiting.",
7700 strerror(errno));
7701 exit(1);
7702 }
7703 server.vm_fd = fileno(server.vm_fp);
7704 server.vm_next_page = 0;
7705 server.vm_near_pages = 0;
7706 server.vm_stats_used_pages = 0;
7707 server.vm_stats_swapped_objects = 0;
7708 server.vm_stats_swapouts = 0;
7709 server.vm_stats_swapins = 0;
7710 totsize = server.vm_pages*server.vm_page_size;
7711 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7712 if (ftruncate(server.vm_fd,totsize) == -1) {
7713 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7714 strerror(errno));
7715 exit(1);
7716 } else {
7717 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7718 }
7719 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
7720 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
7721 (long long) (server.vm_pages+7)/8, server.vm_pages);
7722 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
7723
7724 /* Initialize threaded I/O (used by Virtual Memory) */
7725 server.io_newjobs = listCreate();
7726 server.io_processing = listCreate();
7727 server.io_processed = listCreate();
7728 server.io_ready_clients = listCreate();
7729 pthread_mutex_init(&server.io_mutex,NULL);
7730 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7731 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
7732 server.io_active_threads = 0;
7733 if (pipe(pipefds) == -1) {
7734 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7735 ,strerror(errno));
7736 exit(1);
7737 }
7738 server.io_ready_pipe_read = pipefds[0];
7739 server.io_ready_pipe_write = pipefds[1];
7740 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
7741 /* LZF requires a lot of stack */
7742 pthread_attr_init(&server.io_threads_attr);
7743 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7744 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7745 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
7746 /* Listen for events in the threaded I/O pipe */
7747 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7748 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7749 oom("creating file event");
7750 }
7751
7752 /* Mark the page as used */
7753 static void vmMarkPageUsed(off_t page) {
7754 off_t byte = page/8;
7755 int bit = page&7;
7756 redisAssert(vmFreePage(page) == 1);
7757 server.vm_bitmap[byte] |= 1<<bit;
7758 }
7759
7760 /* Mark N contiguous pages as used, with 'page' being the first. */
7761 static void vmMarkPagesUsed(off_t page, off_t count) {
7762 off_t j;
7763
7764 for (j = 0; j < count; j++)
7765 vmMarkPageUsed(page+j);
7766 server.vm_stats_used_pages += count;
7767 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
7768 (long long)count, (long long)page);
7769 }
7770
7771 /* Mark the page as free */
7772 static void vmMarkPageFree(off_t page) {
7773 off_t byte = page/8;
7774 int bit = page&7;
7775 redisAssert(vmFreePage(page) == 0);
7776 server.vm_bitmap[byte] &= ~(1<<bit);
7777 }
7778
7779 /* Mark N contiguous pages as free, with 'page' being the first. */
7780 static void vmMarkPagesFree(off_t page, off_t count) {
7781 off_t j;
7782
7783 for (j = 0; j < count; j++)
7784 vmMarkPageFree(page+j);
7785 server.vm_stats_used_pages -= count;
7786 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
7787 (long long)count, (long long)page);
7788 }
7789
7790 /* Test if the page is free */
7791 static int vmFreePage(off_t page) {
7792 off_t byte = page/8;
7793 int bit = page&7;
7794 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
7795 }
7796
7797 /* Find N contiguous free pages storing the first page of the cluster in *first.
7798 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7799 * REDIS_ERR is returned.
7800 *
7801 * This function uses a simple algorithm: we try to allocate
7802 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7803 * again from the start of the swap file searching for free spaces.
7804 *
7805 * If it looks pretty clear that there are no free pages near our offset
7806 * we try to find less populated places doing a forward jump of
7807 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7808 * without hurry, and then we jump again and so forth...
7809 *
7810 * This function can be improved using a free list to avoid to guess
7811 * too much, since we could collect data about freed pages.
7812 *
7813 * note: I implemented this function just after watching an episode of
7814 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7815 */
7816 static int vmFindContiguousPages(off_t *first, off_t n) {
7817 off_t base, offset = 0, since_jump = 0, numfree = 0;
7818
7819 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7820 server.vm_near_pages = 0;
7821 server.vm_next_page = 0;
7822 }
7823 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7824 base = server.vm_next_page;
7825
7826 while(offset < server.vm_pages) {
7827 off_t this = base+offset;
7828
7829 /* If we overflow, restart from page zero */
7830 if (this >= server.vm_pages) {
7831 this -= server.vm_pages;
7832 if (this == 0) {
7833 /* Just overflowed, what we found on tail is no longer
7834 * interesting, as it's no longer contiguous. */
7835 numfree = 0;
7836 }
7837 }
7838 if (vmFreePage(this)) {
7839 /* This is a free page */
7840 numfree++;
7841 /* Already got N free pages? Return to the caller, with success */
7842 if (numfree == n) {
7843 *first = this-(n-1);
7844 server.vm_next_page = this+1;
7845 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
7846 return REDIS_OK;
7847 }
7848 } else {
7849 /* The current one is not a free page */
7850 numfree = 0;
7851 }
7852
7853 /* Fast-forward if the current page is not free and we already
7854 * searched enough near this place. */
7855 since_jump++;
7856 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7857 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7858 since_jump = 0;
7859 /* Note that even if we rewind after the jump, we are don't need
7860 * to make sure numfree is set to zero as we only jump *if* it
7861 * is set to zero. */
7862 } else {
7863 /* Otherwise just check the next page */
7864 offset++;
7865 }
7866 }
7867 return REDIS_ERR;
7868 }
7869
7870 /* Write the specified object at the specified page of the swap file */
7871 static int vmWriteObjectOnSwap(robj *o, off_t page) {
7872 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7873 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7874 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7875 redisLog(REDIS_WARNING,
7876 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
7877 strerror(errno));
7878 return REDIS_ERR;
7879 }
7880 rdbSaveObject(server.vm_fp,o);
7881 fflush(server.vm_fp);
7882 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7883 return REDIS_OK;
7884 }
7885
7886 /* Swap the 'val' object relative to 'key' into disk. Store all the information
7887 * needed to later retrieve the object into the key object.
7888 * If we can't find enough contiguous empty pages to swap the object on disk
7889 * REDIS_ERR is returned. */
7890 static int vmSwapObjectBlocking(robj *key, robj *val) {
7891 off_t pages = rdbSavedObjectPages(val,NULL);
7892 off_t page;
7893
7894 assert(key->storage == REDIS_VM_MEMORY);
7895 assert(key->refcount == 1);
7896 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
7897 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
7898 key->vm.page = page;
7899 key->vm.usedpages = pages;
7900 key->storage = REDIS_VM_SWAPPED;
7901 key->vtype = val->type;
7902 decrRefCount(val); /* Deallocate the object from memory. */
7903 vmMarkPagesUsed(page,pages);
7904 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7905 (unsigned char*) key->ptr,
7906 (unsigned long long) page, (unsigned long long) pages);
7907 server.vm_stats_swapped_objects++;
7908 server.vm_stats_swapouts++;
7909 return REDIS_OK;
7910 }
7911
7912 static robj *vmReadObjectFromSwap(off_t page, int type) {
7913 robj *o;
7914
7915 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7916 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7917 redisLog(REDIS_WARNING,
7918 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
7919 strerror(errno));
7920 _exit(1);
7921 }
7922 o = rdbLoadObject(type,server.vm_fp);
7923 if (o == NULL) {
7924 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
7925 _exit(1);
7926 }
7927 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7928 return o;
7929 }
7930
7931 /* Load the value object relative to the 'key' object from swap to memory.
7932 * The newly allocated object is returned.
7933 *
7934 * If preview is true the unserialized object is returned to the caller but
7935 * no changes are made to the key object, nor the pages are marked as freed */
7936 static robj *vmGenericLoadObject(robj *key, int preview) {
7937 robj *val;
7938
7939 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
7940 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7941 if (!preview) {
7942 key->storage = REDIS_VM_MEMORY;
7943 key->vm.atime = server.unixtime;
7944 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7945 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7946 (unsigned char*) key->ptr);
7947 server.vm_stats_swapped_objects--;
7948 } else {
7949 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7950 (unsigned char*) key->ptr);
7951 }
7952 server.vm_stats_swapins++;
7953 return val;
7954 }
7955
7956 /* Plain object loading, from swap to memory */
7957 static robj *vmLoadObject(robj *key) {
7958 /* If we are loading the object in background, stop it, we
7959 * need to load this object synchronously ASAP. */
7960 if (key->storage == REDIS_VM_LOADING)
7961 vmCancelThreadedIOJob(key);
7962 return vmGenericLoadObject(key,0);
7963 }
7964
7965 /* Just load the value on disk, without to modify the key.
7966 * This is useful when we want to perform some operation on the value
7967 * without to really bring it from swap to memory, like while saving the
7968 * dataset or rewriting the append only log. */
7969 static robj *vmPreviewObject(robj *key) {
7970 return vmGenericLoadObject(key,1);
7971 }
7972
7973 /* How a good candidate is this object for swapping?
7974 * The better candidate it is, the greater the returned value.
7975 *
7976 * Currently we try to perform a fast estimation of the object size in
7977 * memory, and combine it with aging informations.
7978 *
7979 * Basically swappability = idle-time * log(estimated size)
7980 *
7981 * Bigger objects are preferred over smaller objects, but not
7982 * proportionally, this is why we use the logarithm. This algorithm is
7983 * just a first try and will probably be tuned later. */
7984 static double computeObjectSwappability(robj *o) {
7985 time_t age = server.unixtime - o->vm.atime;
7986 long asize = 0;
7987 list *l;
7988 dict *d;
7989 struct dictEntry *de;
7990 int z;
7991
7992 if (age <= 0) return 0;
7993 switch(o->type) {
7994 case REDIS_STRING:
7995 if (o->encoding != REDIS_ENCODING_RAW) {
7996 asize = sizeof(*o);
7997 } else {
7998 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7999 }
8000 break;
8001 case REDIS_LIST:
8002 l = o->ptr;
8003 listNode *ln = listFirst(l);
8004
8005 asize = sizeof(list);
8006 if (ln) {
8007 robj *ele = ln->value;
8008 long elesize;
8009
8010 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8011 (sizeof(*o)+sdslen(ele->ptr)) :
8012 sizeof(*o);
8013 asize += (sizeof(listNode)+elesize)*listLength(l);
8014 }
8015 break;
8016 case REDIS_SET:
8017 case REDIS_ZSET:
8018 z = (o->type == REDIS_ZSET);
8019 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8020
8021 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8022 if (z) asize += sizeof(zset)-sizeof(dict);
8023 if (dictSize(d)) {
8024 long elesize;
8025 robj *ele;
8026
8027 de = dictGetRandomKey(d);
8028 ele = dictGetEntryKey(de);
8029 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8030 (sizeof(*o)+sdslen(ele->ptr)) :
8031 sizeof(*o);
8032 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8033 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8034 }
8035 break;
8036 }
8037 return (double)age*log(1+asize);
8038 }
8039
8040 /* Try to swap an object that's a good candidate for swapping.
8041 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8042 * to swap any object at all.
8043 *
8044 * If 'usethreaded' is true, Redis will try to swap the object in background
8045 * using I/O threads. */
8046 static int vmSwapOneObject(int usethreads) {
8047 int j, i;
8048 struct dictEntry *best = NULL;
8049 double best_swappability = 0;
8050 redisDb *best_db = NULL;
8051 robj *key, *val;
8052
8053 for (j = 0; j < server.dbnum; j++) {
8054 redisDb *db = server.db+j;
8055 /* Why maxtries is set to 100?
8056 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8057 * are swappable objects */
8058 int maxtries = 100;
8059
8060 if (dictSize(db->dict) == 0) continue;
8061 for (i = 0; i < 5; i++) {
8062 dictEntry *de;
8063 double swappability;
8064
8065 if (maxtries) maxtries--;
8066 de = dictGetRandomKey(db->dict);
8067 key = dictGetEntryKey(de);
8068 val = dictGetEntryVal(de);
8069 /* Only swap objects that are currently in memory.
8070 *
8071 * Also don't swap shared objects if threaded VM is on, as we
8072 * try to ensure that the main thread does not touch the
8073 * object while the I/O thread is using it, but we can't
8074 * control other keys without adding additional mutex. */
8075 if (key->storage != REDIS_VM_MEMORY ||
8076 (server.vm_max_threads != 0 && val->refcount != 1)) {
8077 if (maxtries) i--; /* don't count this try */
8078 continue;
8079 }
8080 swappability = computeObjectSwappability(val);
8081 if (!best || swappability > best_swappability) {
8082 best = de;
8083 best_swappability = swappability;
8084 best_db = db;
8085 }
8086 }
8087 }
8088 if (best == NULL) return REDIS_ERR;
8089 key = dictGetEntryKey(best);
8090 val = dictGetEntryVal(best);
8091
8092 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8093 key->ptr, best_swappability);
8094
8095 /* Unshare the key if needed */
8096 if (key->refcount > 1) {
8097 robj *newkey = dupStringObject(key);
8098 decrRefCount(key);
8099 key = dictGetEntryKey(best) = newkey;
8100 }
8101 /* Swap it */
8102 if (usethreads) {
8103 vmSwapObjectThreaded(key,val,best_db);
8104 return REDIS_OK;
8105 } else {
8106 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8107 dictGetEntryVal(best) = NULL;
8108 return REDIS_OK;
8109 } else {
8110 return REDIS_ERR;
8111 }
8112 }
8113 }
8114
/* Swap one object out synchronously. See vmSwapOneObject() for the meaning
 * of the return value. */
static int vmSwapOneObjectBlocking() {
    int result = vmSwapOneObject(0);

    return result;
}
8118
/* Swap one object out in background using the I/O threads. See
 * vmSwapOneObject() for the meaning of the return value. */
static int vmSwapOneObjectThreaded() {
    int result = vmSwapOneObject(1);

    return result;
}
8122
8123 /* Return true if it's safe to swap out objects in a given moment.
8124 * Basically we don't want to swap objects out while there is a BGSAVE
8125 * or a BGAEOREWRITE running in backgroud. */
8126 static int vmCanSwapOut(void) {
8127 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8128 }
8129
8130 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8131 * and was deleted. Otherwise 0 is returned. */
8132 static int deleteIfSwapped(redisDb *db, robj *key) {
8133 dictEntry *de;
8134 robj *foundkey;
8135
8136 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8137 foundkey = dictGetEntryKey(de);
8138 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8139 deleteKey(db,key);
8140 return 1;
8141 }
8142
8143 /* =================== Virtual Memory - Threaded I/O ======================= */
8144
8145 static void freeIOJob(iojob *j) {
8146 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8147 j->type == REDIS_IOJOB_DO_SWAP ||
8148 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8149 decrRefCount(j->val);
8150 decrRefCount(j->key);
8151 zfree(j);
8152 }
8153
8154 /* Every time a thread finished a Job, it writes a byte into the write side
8155 * of an unix pipe in order to "awake" the main thread, and this function
8156 * is called. */
8157 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8158 int mask)
8159 {
8160 char buf[1];
8161 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8162 REDIS_NOTUSED(el);
8163 REDIS_NOTUSED(mask);
8164 REDIS_NOTUSED(privdata);
8165
8166 /* For every byte we read in the read side of the pipe, there is one
8167 * I/O job completed to process. */
8168 while((retval = read(fd,buf,1)) == 1) {
8169 iojob *j;
8170 listNode *ln;
8171 robj *key;
8172 struct dictEntry *de;
8173
8174 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8175
8176 /* Get the processed element (the oldest one) */
8177 lockThreadedIO();
8178 assert(listLength(server.io_processed) != 0);
8179 if (toprocess == -1) {
8180 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8181 if (toprocess <= 0) toprocess = 1;
8182 }
8183 ln = listFirst(server.io_processed);
8184 j = ln->value;
8185 listDelNode(server.io_processed,ln);
8186 unlockThreadedIO();
8187 /* If this job is marked as canceled, just ignore it */
8188 if (j->canceled) {
8189 freeIOJob(j);
8190 continue;
8191 }
8192 /* Post process it in the main thread, as there are things we
8193 * can do just here to avoid race conditions and/or invasive locks */
8194 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8195 de = dictFind(j->db->dict,j->key);
8196 assert(de != NULL);
8197 key = dictGetEntryKey(de);
8198 if (j->type == REDIS_IOJOB_LOAD) {
8199 redisDb *db;
8200
8201 /* Key loaded, bring it at home */
8202 key->storage = REDIS_VM_MEMORY;
8203 key->vm.atime = server.unixtime;
8204 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8205 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8206 (unsigned char*) key->ptr);
8207 server.vm_stats_swapped_objects--;
8208 server.vm_stats_swapins++;
8209 dictGetEntryVal(de) = j->val;
8210 incrRefCount(j->val);
8211 db = j->db;
8212 freeIOJob(j);
8213 /* Handle clients waiting for this key to be loaded. */
8214 handleClientsBlockedOnSwappedKey(db,key);
8215 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8216 /* Now we know the amount of pages required to swap this object.
8217 * Let's find some space for it, and queue this task again
8218 * rebranded as REDIS_IOJOB_DO_SWAP. */
8219 if (!vmCanSwapOut() ||
8220 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8221 {
8222 /* Ooops... no space or we can't swap as there is
8223 * a fork()ed Redis trying to save stuff on disk. */
8224 freeIOJob(j);
8225 key->storage = REDIS_VM_MEMORY; /* undo operation */
8226 } else {
8227 /* Note that we need to mark this pages as used now,
8228 * if the job will be canceled, we'll mark them as freed
8229 * again. */
8230 vmMarkPagesUsed(j->page,j->pages);
8231 j->type = REDIS_IOJOB_DO_SWAP;
8232 lockThreadedIO();
8233 queueIOJob(j);
8234 unlockThreadedIO();
8235 }
8236 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8237 robj *val;
8238
8239 /* Key swapped. We can finally free some memory. */
8240 if (key->storage != REDIS_VM_SWAPPING) {
8241 printf("key->storage: %d\n",key->storage);
8242 printf("key->name: %s\n",(char*)key->ptr);
8243 printf("key->refcount: %d\n",key->refcount);
8244 printf("val: %p\n",(void*)j->val);
8245 printf("val->type: %d\n",j->val->type);
8246 printf("val->ptr: %s\n",(char*)j->val->ptr);
8247 }
8248 redisAssert(key->storage == REDIS_VM_SWAPPING);
8249 val = dictGetEntryVal(de);
8250 key->vm.page = j->page;
8251 key->vm.usedpages = j->pages;
8252 key->storage = REDIS_VM_SWAPPED;
8253 key->vtype = j->val->type;
8254 decrRefCount(val); /* Deallocate the object from memory. */
8255 dictGetEntryVal(de) = NULL;
8256 redisLog(REDIS_DEBUG,
8257 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8258 (unsigned char*) key->ptr,
8259 (unsigned long long) j->page, (unsigned long long) j->pages);
8260 server.vm_stats_swapped_objects++;
8261 server.vm_stats_swapouts++;
8262 freeIOJob(j);
8263 /* Put a few more swap requests in queue if we are still
8264 * out of memory */
8265 if (trytoswap && vmCanSwapOut() &&
8266 zmalloc_used_memory() > server.vm_max_memory)
8267 {
8268 int more = 1;
8269 while(more) {
8270 lockThreadedIO();
8271 more = listLength(server.io_newjobs) <
8272 (unsigned) server.vm_max_threads;
8273 unlockThreadedIO();
8274 /* Don't waste CPU time if swappable objects are rare. */
8275 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8276 trytoswap = 0;
8277 break;
8278 }
8279 }
8280 }
8281 }
8282 processed++;
8283 if (processed == toprocess) return;
8284 }
8285 if (retval < 0 && errno != EAGAIN) {
8286 redisLog(REDIS_WARNING,
8287 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8288 strerror(errno));
8289 }
8290 }
8291
8292 static void lockThreadedIO(void) {
8293 pthread_mutex_lock(&server.io_mutex);
8294 }
8295
8296 static void unlockThreadedIO(void) {
8297 pthread_mutex_unlock(&server.io_mutex);
8298 }
8299
8300 /* Remove the specified object from the threaded I/O queue if still not
8301 * processed, otherwise make sure to flag it as canceled. */
8302 static void vmCancelThreadedIOJob(robj *o) {
8303 list *lists[3] = {
8304 server.io_newjobs, /* 0 */
8305 server.io_processing, /* 1 */
8306 server.io_processed /* 2 */
8307 };
8308 int i;
8309
8310 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8311 again:
8312 lockThreadedIO();
8313 /* Search for a matching key in one of the queues */
8314 for (i = 0; i < 3; i++) {
8315 listNode *ln;
8316 listIter li;
8317
8318 listRewind(lists[i],&li);
8319 while ((ln = listNext(&li)) != NULL) {
8320 iojob *job = ln->value;
8321
8322 if (job->canceled) continue; /* Skip this, already canceled. */
8323 if (compareStringObjects(job->key,o) == 0) {
8324 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8325 (void*)job, (char*)o->ptr, job->type, i);
8326 /* Mark the pages as free since the swap didn't happened
8327 * or happened but is now discarded. */
8328 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8329 vmMarkPagesFree(job->page,job->pages);
8330 /* Cancel the job. It depends on the list the job is
8331 * living in. */
8332 switch(i) {
8333 case 0: /* io_newjobs */
8334 /* If the job was yet not processed the best thing to do
8335 * is to remove it from the queue at all */
8336 freeIOJob(job);
8337 listDelNode(lists[i],ln);
8338 break;
8339 case 1: /* io_processing */
8340 /* Oh Shi- the thread is messing with the Job:
8341 *
8342 * Probably it's accessing the object if this is a
8343 * PREPARE_SWAP or DO_SWAP job.
8344 * If it's a LOAD job it may be reading from disk and
8345 * if we don't wait for the job to terminate before to
8346 * cancel it, maybe in a few microseconds data can be
8347 * corrupted in this pages. So the short story is:
8348 *
8349 * Better to wait for the job to move into the
8350 * next queue (processed)... */
8351
8352 /* We try again and again until the job is completed. */
8353 unlockThreadedIO();
8354 /* But let's wait some time for the I/O thread
8355 * to finish with this job. After all this condition
8356 * should be very rare. */
8357 usleep(1);
8358 goto again;
8359 case 2: /* io_processed */
8360 /* The job was already processed, that's easy...
8361 * just mark it as canceled so that we'll ignore it
8362 * when processing completed jobs. */
8363 job->canceled = 1;
8364 break;
8365 }
8366 /* Finally we have to adjust the storage type of the object
8367 * in order to "UNDO" the operaiton. */
8368 if (o->storage == REDIS_VM_LOADING)
8369 o->storage = REDIS_VM_SWAPPED;
8370 else if (o->storage == REDIS_VM_SWAPPING)
8371 o->storage = REDIS_VM_MEMORY;
8372 unlockThreadedIO();
8373 return;
8374 }
8375 }
8376 }
8377 unlockThreadedIO();
8378 assert(1 != 1); /* We should never reach this */
8379 }
8380
8381 static void *IOThreadEntryPoint(void *arg) {
8382 iojob *j;
8383 listNode *ln;
8384 REDIS_NOTUSED(arg);
8385
8386 pthread_detach(pthread_self());
8387 while(1) {
8388 /* Get a new job to process */
8389 lockThreadedIO();
8390 if (listLength(server.io_newjobs) == 0) {
8391 /* No new jobs in queue, exit. */
8392 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8393 (long) pthread_self());
8394 server.io_active_threads--;
8395 unlockThreadedIO();
8396 return NULL;
8397 }
8398 ln = listFirst(server.io_newjobs);
8399 j = ln->value;
8400 listDelNode(server.io_newjobs,ln);
8401 /* Add the job in the processing queue */
8402 j->thread = pthread_self();
8403 listAddNodeTail(server.io_processing,j);
8404 ln = listLast(server.io_processing); /* We use ln later to remove it */
8405 unlockThreadedIO();
8406 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8407 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8408
8409 /* Process the Job */
8410 if (j->type == REDIS_IOJOB_LOAD) {
8411 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8412 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8413 FILE *fp = fopen("/dev/null","w+");
8414 j->pages = rdbSavedObjectPages(j->val,fp);
8415 fclose(fp);
8416 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8417 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8418 j->canceled = 1;
8419 }
8420
8421 /* Done: insert the job into the processed queue */
8422 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8423 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8424 lockThreadedIO();
8425 listDelNode(server.io_processing,ln);
8426 listAddNodeTail(server.io_processed,j);
8427 unlockThreadedIO();
8428
8429 /* Signal the main thread there is new stuff to process */
8430 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8431 }
8432 return NULL; /* never reached */
8433 }
8434
8435 static void spawnIOThread(void) {
8436 pthread_t thread;
8437 sigset_t mask, omask;
8438
8439 sigemptyset(&mask);
8440 sigaddset(&mask,SIGCHLD);
8441 sigaddset(&mask,SIGHUP);
8442 sigaddset(&mask,SIGPIPE);
8443 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8444 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8445 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8446 server.io_active_threads++;
8447 }
8448
8449 /* We need to wait for the last thread to exit before we are able to
8450 * fork() in order to BGSAVE or BGREWRITEAOF. */
8451 static void waitEmptyIOJobsQueue(void) {
8452 while(1) {
8453 int io_processed_len;
8454
8455 lockThreadedIO();
8456 if (listLength(server.io_newjobs) == 0 &&
8457 listLength(server.io_processing) == 0 &&
8458 server.io_active_threads == 0)
8459 {
8460 unlockThreadedIO();
8461 return;
8462 }
8463 /* While waiting for empty jobs queue condition we post-process some
8464 * finshed job, as I/O threads may be hanging trying to write against
8465 * the io_ready_pipe_write FD but there are so much pending jobs that
8466 * it's blocking. */
8467 io_processed_len = listLength(server.io_processed);
8468 unlockThreadedIO();
8469 if (io_processed_len) {
8470 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8471 usleep(1000); /* 1 millisecond */
8472 } else {
8473 usleep(10000); /* 10 milliseconds */
8474 }
8475 }
8476 }
8477
8478 static void vmReopenSwapFile(void) {
8479 /* Note: we don't close the old one as we are in the child process
8480 * and don't want to mess at all with the original file object. */
8481 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8482 if (server.vm_fp == NULL) {
8483 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8484 server.vm_swap_file);
8485 _exit(1);
8486 }
8487 server.vm_fd = fileno(server.vm_fp);
8488 }
8489
8490 /* This function must be called while with threaded IO locked */
8491 static void queueIOJob(iojob *j) {
8492 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8493 (void*)j, j->type, (char*)j->key->ptr);
8494 listAddNodeTail(server.io_newjobs,j);
8495 if (server.io_active_threads < server.vm_max_threads)
8496 spawnIOThread();
8497 }
8498
8499 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8500 iojob *j;
8501
8502 assert(key->storage == REDIS_VM_MEMORY);
8503 assert(key->refcount == 1);
8504
8505 j = zmalloc(sizeof(*j));
8506 j->type = REDIS_IOJOB_PREPARE_SWAP;
8507 j->db = db;
8508 j->key = dupStringObject(key);
8509 j->val = val;
8510 incrRefCount(val);
8511 j->canceled = 0;
8512 j->thread = (pthread_t) -1;
8513 key->storage = REDIS_VM_SWAPPING;
8514
8515 lockThreadedIO();
8516 queueIOJob(j);
8517 unlockThreadedIO();
8518 return REDIS_OK;
8519 }
8520
8521 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8522
8523 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8524 * If there is not already a job loading the key, it is craeted.
8525 * The key is added to the io_keys list in the client structure, and also
8526 * in the hash table mapping swapped keys to waiting clients, that is,
8527 * server.io_waited_keys. */
8528 static int waitForSwappedKey(redisClient *c, robj *key) {
8529 struct dictEntry *de;
8530 robj *o;
8531 list *l;
8532
8533 /* If the key does not exist or is already in RAM we don't need to
8534 * block the client at all. */
8535 de = dictFind(c->db->dict,key);
8536 if (de == NULL) return 0;
8537 o = dictGetEntryKey(de);
8538 if (o->storage == REDIS_VM_MEMORY) {
8539 return 0;
8540 } else if (o->storage == REDIS_VM_SWAPPING) {
8541 /* We were swapping the key, undo it! */
8542 vmCancelThreadedIOJob(o);
8543 return 0;
8544 }
8545
8546 /* OK: the key is either swapped, or being loaded just now. */
8547
8548 /* Add the key to the list of keys this client is waiting for.
8549 * This maps clients to keys they are waiting for. */
8550 listAddNodeTail(c->io_keys,key);
8551 incrRefCount(key);
8552
8553 /* Add the client to the swapped keys => clients waiting map. */
8554 de = dictFind(c->db->io_keys,key);
8555 if (de == NULL) {
8556 int retval;
8557
8558 /* For every key we take a list of clients blocked for it */
8559 l = listCreate();
8560 retval = dictAdd(c->db->io_keys,key,l);
8561 incrRefCount(key);
8562 assert(retval == DICT_OK);
8563 } else {
8564 l = dictGetEntryVal(de);
8565 }
8566 listAddNodeTail(l,c);
8567
8568 /* Are we already loading the key from disk? If not create a job */
8569 if (o->storage == REDIS_VM_SWAPPED) {
8570 iojob *j;
8571
8572 o->storage = REDIS_VM_LOADING;
8573 j = zmalloc(sizeof(*j));
8574 j->type = REDIS_IOJOB_LOAD;
8575 j->db = c->db;
8576 j->key = dupStringObject(key);
8577 j->key->vtype = o->vtype;
8578 j->page = o->vm.page;
8579 j->val = NULL;
8580 j->canceled = 0;
8581 j->thread = (pthread_t) -1;
8582 lockThreadedIO();
8583 queueIOJob(j);
8584 unlockThreadedIO();
8585 }
8586 return 1;
8587 }
8588
8589 /* Is this client attempting to run a command against swapped keys?
8590 * If so, block it ASAP, load the keys in background, then resume it.
8591 *
8592 * The important idea about this function is that it can fail! If keys will
8593 * still be swapped when the client is resumed, this key lookups will
8594 * just block loading keys from disk. In practical terms this should only
8595 * happen with SORT BY command or if there is a bug in this function.
8596 *
8597 * Return 1 if the client is marked as blocked, 0 if the client can
8598 * continue as the keys it is going to access appear to be in memory. */
8599 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8600 int j, last;
8601
8602 if (cmd->vm_firstkey == 0) return 0;
8603 last = cmd->vm_lastkey;
8604 if (last < 0) last = c->argc+last;
8605 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8606 waitForSwappedKey(c,c->argv[j]);
8607 /* If the client was blocked for at least one key, mark it as blocked. */
8608 if (listLength(c->io_keys)) {
8609 c->flags |= REDIS_IO_WAIT;
8610 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8611 server.vm_blocked_clients++;
8612 return 1;
8613 } else {
8614 return 0;
8615 }
8616 }
8617
8618 /* Remove the 'key' from the list of blocked keys for a given client.
8619 *
8620 * The function returns 1 when there are no longer blocking keys after
8621 * the current one was removed (and the client can be unblocked). */
8622 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8623 list *l;
8624 listNode *ln;
8625 listIter li;
8626 struct dictEntry *de;
8627
8628 /* Remove the key from the list of keys this client is waiting for. */
8629 listRewind(c->io_keys,&li);
8630 while ((ln = listNext(&li)) != NULL) {
8631 if (compareStringObjects(ln->value,key) == 0) {
8632 listDelNode(c->io_keys,ln);
8633 break;
8634 }
8635 }
8636 assert(ln != NULL);
8637
8638 /* Remove the client form the key => waiting clients map. */
8639 de = dictFind(c->db->io_keys,key);
8640 assert(de != NULL);
8641 l = dictGetEntryVal(de);
8642 ln = listSearchKey(l,c);
8643 assert(ln != NULL);
8644 listDelNode(l,ln);
8645 if (listLength(l) == 0)
8646 dictDelete(c->db->io_keys,key);
8647
8648 return listLength(c->io_keys) == 0;
8649 }
8650
8651 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8652 struct dictEntry *de;
8653 list *l;
8654 listNode *ln;
8655 int len;
8656
8657 de = dictFind(db->io_keys,key);
8658 if (!de) return;
8659
8660 l = dictGetEntryVal(de);
8661 len = listLength(l);
8662 /* Note: we can't use something like while(listLength(l)) as the list
8663 * can be freed by the calling function when we remove the last element. */
8664 while (len--) {
8665 ln = listFirst(l);
8666 redisClient *c = ln->value;
8667
8668 if (dontWaitForSwappedKey(c,key)) {
8669 /* Put the client in the list of clients ready to go as we
8670 * loaded all the keys about it. */
8671 listAddNodeTail(server.io_ready_clients,c);
8672 }
8673 }
8674 }
8675
8676 /* ================================= Debugging ============================== */
8677
8678 static void debugCommand(redisClient *c) {
8679 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8680 *((char*)-1) = 'x';
8681 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8682 if (rdbSave(server.dbfilename) != REDIS_OK) {
8683 addReply(c,shared.err);
8684 return;
8685 }
8686 emptyDb();
8687 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8688 addReply(c,shared.err);
8689 return;
8690 }
8691 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8692 addReply(c,shared.ok);
8693 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8694 emptyDb();
8695 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8696 addReply(c,shared.err);
8697 return;
8698 }
8699 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8700 addReply(c,shared.ok);
8701 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8702 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8703 robj *key, *val;
8704
8705 if (!de) {
8706 addReply(c,shared.nokeyerr);
8707 return;
8708 }
8709 key = dictGetEntryKey(de);
8710 val = dictGetEntryVal(de);
8711 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
8712 key->storage == REDIS_VM_SWAPPING)) {
8713 addReplySds(c,sdscatprintf(sdsempty(),
8714 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8715 "encoding:%d serializedlength:%lld\r\n",
8716 (void*)key, key->refcount, (void*)val, val->refcount,
8717 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
8718 } else {
8719 addReplySds(c,sdscatprintf(sdsempty(),
8720 "+Key at:%p refcount:%d, value swapped at: page %llu "
8721 "using %llu pages\r\n",
8722 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8723 (unsigned long long) key->vm.usedpages));
8724 }
8725 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8726 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8727 robj *key, *val;
8728
8729 if (!server.vm_enabled) {
8730 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8731 return;
8732 }
8733 if (!de) {
8734 addReply(c,shared.nokeyerr);
8735 return;
8736 }
8737 key = dictGetEntryKey(de);
8738 val = dictGetEntryVal(de);
8739 /* If the key is shared we want to create a copy */
8740 if (key->refcount > 1) {
8741 robj *newkey = dupStringObject(key);
8742 decrRefCount(key);
8743 key = dictGetEntryKey(de) = newkey;
8744 }
8745 /* Swap it */
8746 if (key->storage != REDIS_VM_MEMORY) {
8747 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
8748 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8749 dictGetEntryVal(de) = NULL;
8750 addReply(c,shared.ok);
8751 } else {
8752 addReply(c,shared.err);
8753 }
8754 } else {
8755 addReplySds(c,sdsnew(
8756 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
8757 }
8758 }
8759
8760 static void _redisAssert(char *estr, char *file, int line) {
8761 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
8762 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
8763 #ifdef HAVE_BACKTRACE
8764 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8765 *((char*)-1) = 'x';
8766 #endif
8767 }
8768
8769 /* =================================== Main! ================================ */
8770
8771 #ifdef __linux__
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed value, or -1 if the file can't be opened, can't be
 * read, or does not start with a number. Parsing is done with strtol()
 * instead of atoi() so that unparsable content is reported as an error
 * rather than being confused with a legitimate value of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
8785
8786 void linuxOvercommitMemoryWarning(void) {
8787 if (linuxOvercommitMemoryValue() == 0) {
8788 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8789 }
8790 }
8791 #endif /* __linux__ */
8792
8793 static void daemonize(void) {
8794 int fd;
8795 FILE *fp;
8796
8797 if (fork() != 0) exit(0); /* parent exits */
8798 setsid(); /* create a new session */
8799
8800 /* Every output goes to /dev/null. If Redis is daemonized but
8801 * the 'logfile' is set to 'stdout' in the configuration file
8802 * it will not log at all. */
8803 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8804 dup2(fd, STDIN_FILENO);
8805 dup2(fd, STDOUT_FILENO);
8806 dup2(fd, STDERR_FILENO);
8807 if (fd > STDERR_FILENO) close(fd);
8808 }
8809 /* Try to write the pid file */
8810 fp = fopen(server.pidfile,"w");
8811 if (fp) {
8812 fprintf(fp,"%d\n",getpid());
8813 fclose(fp);
8814 }
8815 }
8816
8817 int main(int argc, char **argv) {
8818 time_t start;
8819
8820 initServerConfig();
8821 if (argc == 2) {
8822 resetServerSaveParams();
8823 loadServerConfig(argv[1]);
8824 } else if (argc > 2) {
8825 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8826 exit(1);
8827 } else {
8828 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8829 }
8830 if (server.daemonize) daemonize();
8831 initServer();
8832 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8833 #ifdef __linux__
8834 linuxOvercommitMemoryWarning();
8835 #endif
8836 start = time(NULL);
8837 if (server.appendonly) {
8838 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
8839 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
8840 } else {
8841 if (rdbLoad(server.dbfilename) == REDIS_OK)
8842 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
8843 }
8844 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
8845 aeSetBeforeSleepProc(server.el,beforeSleep);
8846 aeMain(server.el);
8847 aeDeleteEventLoop(server.el);
8848 return 0;
8849 }
8850
8851 /* ============================= Backtrace support ========================= */
8852
8853 #ifdef HAVE_BACKTRACE
8854 static char *findFuncName(void *pointer, unsigned long *offset);
8855
8856 static void *getMcontextEip(ucontext_t *uc) {
8857 #if defined(__FreeBSD__)
8858 return (void*) uc->uc_mcontext.mc_eip;
8859 #elif defined(__dietlibc__)
8860 return (void*) uc->uc_mcontext.eip;
8861 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
8862 #if __x86_64__
8863 return (void*) uc->uc_mcontext->__ss.__rip;
8864 #else
8865 return (void*) uc->uc_mcontext->__ss.__eip;
8866 #endif
8867 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
8868 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
8869 return (void*) uc->uc_mcontext->__ss.__rip;
8870 #else
8871 return (void*) uc->uc_mcontext->__ss.__eip;
8872 #endif
8873 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8874 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
8875 #elif defined(__ia64__) /* Linux IA64 */
8876 return (void*) uc->uc_mcontext.sc_ip;
8877 #else
8878 return NULL;
8879 #endif
8880 }
8881
8882 static void segvHandler(int sig, siginfo_t *info, void *secret) {
8883 void *trace[100];
8884 char **messages = NULL;
8885 int i, trace_size = 0;
8886 unsigned long offset=0;
8887 ucontext_t *uc = (ucontext_t*) secret;
8888 sds infostring;
8889 REDIS_NOTUSED(info);
8890
8891 redisLog(REDIS_WARNING,
8892 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
8893 infostring = genRedisInfoString();
8894 redisLog(REDIS_WARNING, "%s",infostring);
8895 /* It's not safe to sdsfree() the returned string under memory
8896 * corruption conditions. Let it leak as we are going to abort */
8897
8898 trace_size = backtrace(trace, 100);
8899 /* overwrite sigaction with caller's address */
8900 if (getMcontextEip(uc) != NULL) {
8901 trace[1] = getMcontextEip(uc);
8902 }
8903 messages = backtrace_symbols(trace, trace_size);
8904
8905 for (i=1; i<trace_size; ++i) {
8906 char *fn = findFuncName(trace[i], &offset), *p;
8907
8908 p = strchr(messages[i],'+');
8909 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8910 redisLog(REDIS_WARNING,"%s", messages[i]);
8911 } else {
8912 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8913 }
8914 }
8915 /* free(messages); Don't call free() with possibly corrupted memory. */
8916 _exit(0);
8917 }
8918
8919 static void setupSigSegvAction(void) {
8920 struct sigaction act;
8921
8922 sigemptyset (&act.sa_mask);
8923 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8924 * is used. Otherwise, sa_handler is used */
8925 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8926 act.sa_sigaction = segvHandler;
8927 sigaction (SIGSEGV, &act, NULL);
8928 sigaction (SIGBUS, &act, NULL);
8929 sigaction (SIGFPE, &act, NULL);
8930 sigaction (SIGILL, &act, NULL);
8931 sigaction (SIGBUS, &act, NULL);
8932 return;
8933 }
8934
8935 #include "staticsymbols.h"
8936 /* This function try to convert a pointer into a function name. It's used in
8937 * oreder to provide a backtrace under segmentation fault that's able to
8938 * display functions declared as static (otherwise the backtrace is useless). */
8939 static char *findFuncName(void *pointer, unsigned long *offset){
8940 int i, ret = -1;
8941 unsigned long off, minoff = 0;
8942
8943 /* Try to match against the Symbol with the smallest offset */
8944 for (i=0; symsTable[i].pointer; i++) {
8945 unsigned long lp = (unsigned long) pointer;
8946
8947 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8948 off=lp-symsTable[i].pointer;
8949 if (ret < 0 || off < minoff) {
8950 minoff=off;
8951 ret=i;
8952 }
8953 }
8954 }
8955 if (ret == -1) return NULL;
8956 *offset = minoff;
8957 return symsTable[ret].name;
8958 }
8959 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
8962 #endif /* HAVE_BACKTRACE */
8963
8964
8965
8966 /* The End */
8967
8968
8969