]> git.saurik.com Git - redis.git/blob - redis.c
An interesting refactoring + more expressive internal API
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.4" /* Redis version string */
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 4
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kinds of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as a hash table */
129
130 static char* strencoding[] = { /* names listed in REDIS_ENCODING_* order */
131 "raw", "int", "zipmap", "hashtable"
132 };
133
134 /* Object types only used for dumping to disk */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
138
139 /* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specifies the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
149 *
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX
157
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * according to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
165
166 /* Virtual memory object->where field. */
167 #define REDIS_VM_MEMORY 0 /* The object is on memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about these magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
184 /* Client flags (bit mask values, combined in redisClient.flags) */
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
197 /* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
206 /* List related stuff */
207 #define REDIS_HEAD 0
208 #define REDIS_TAIL 1
209
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024
215
216 /* Log levels */
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
221
222 /* Anti-warning macro: evaluates V as a void expression so it counts as used */
223 #define REDIS_NOTUSED(V) ((void) V)
224
225 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227
228 /* Append only defines */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
232
233 /* Hashes related defaults */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
237 /* We can print the stacktrace, so our assert is defined this way:
 * when the expression is false it calls _redisAssert() with the expression
 * text, the file name and the line number, and then terminates the process
 * with _exit(1). When the expression is true it evaluates to (void)0. */
238 #define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
239 static void _redisAssert(char *estr, char *file, int line);
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
/* The VM object structure: the swap-file location of an object (first page
 * and number of pages) together with its last access time.
 *
 * Only the type is declared here. The declaration used to end in "} vm;",
 * which, besides declaring the type, also defined a stray global object
 * named 'vm'. The structure is embedded as the 'vm' member of
 * struct redisObject, so no file-scope instance is needed. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr;
255 unsigned char type; /* one of the object types, e.g. REDIS_STRING */
256 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount;
262 /* VM fields, these are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj (an lvalue, not a pointer) to initialize as a raw-encoded
 * string with refcount 1; _ptr becomes its 'ptr' field. The 'storage' field
 * is only set (to REDIS_VM_MEMORY) when server.vm_enabled is true.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * safely as the body of an if/else. Arguments are parenthesized so that
 * expressions such as *p or a+b expand correctly. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
/* State of a single database: the keyspace plus the auxiliary dictionaries
 * described field by field below. */
281 typedef struct redisDb {
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id;
287 } redisDb;
288
289 /* Client MULTI/EXEC state: one command entry of the multiState.commands
 * array, i.e. its argument vector and the command table entry. */
290 typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294 } multiCmd;
295
/* The array of multiCmd entries kept for a client, with its length. */
296 typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to take per-client state.
302 * Clients are taken in a linked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 } redisClient;
331
/* A (seconds, changes) pair; redisServer.saveparams points to an array of
 * these. NOTE(review): presumably one "save <seconds> <changes>" rule per
 * entry - confirm against the config parser. */
332 struct saveparam {
333 time_t seconds;
334 int changes;
335 };
336
337 /* Global server state structure */
338 struct redisServer {
339 int port;
340 int fd;
341 redisDb *db;
342 dict *sharingpool; /* Pool used for object sharing */
343 unsigned int sharingpoolsize;
344 long long dirty; /* changes to DB from the last save */
345 list *clients;
346 list *slaves, *monitors;
347 char neterr[ANET_ERR_LEN];
348 aeEventLoop *el;
349 int cronloops; /* number of times the cron function run */
350 list *objfreelist; /* A list of freed objects to avoid malloc() */
351 time_t lastsave; /* Unix time of last successful save */
352 /* Fields used only for stats */
353 time_t stat_starttime; /* server start time */
354 long long stat_numcommands; /* number of processed commands */
355 long long stat_numconnections; /* number of connections received */
356 /* Configuration */
357 int verbosity;
358 int glueoutputbuf;
359 int maxidletime;
360 int dbnum;
361 int daemonize;
362 int appendonly;
363 int appendfsync;
364 time_t lastfsync;
365 int appendfd;
366 int appendseldb;
367 char *pidfile;
368 pid_t bgsavechildpid;
369 pid_t bgrewritechildpid;
370 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
371 struct saveparam *saveparams;
372 int saveparamslen;
373 char *logfile;
374 char *bindaddr;
375 char *dbfilename;
376 char *appendfilename;
377 char *requirepass;
378 int shareobjects;
379 int rdbcompression;
380 /* Replication related */
381 int isslave;
382 char *masterauth;
383 char *masterhost;
384 int masterport;
385 redisClient *master; /* client that is master for this slave */
386 int replstate;
387 unsigned int maxclients;
388 unsigned long long maxmemory;
389 unsigned int blpop_blocked_clients;
390 unsigned int vm_blocked_clients;
391 /* Sort parameters - qsort_r() is only available under BSD so we
392 * have to take this state global, in order to pass it to sortCompare() */
393 int sort_desc;
394 int sort_alpha;
395 int sort_bypattern;
396 /* Virtual memory configuration */
397 int vm_enabled;
398 char *vm_swap_file;
399 off_t vm_page_size;
400 off_t vm_pages;
401 unsigned long long vm_max_memory;
402 /* Hashes config */
403 size_t hash_max_zipmap_entries;
404 size_t hash_max_zipmap_value;
405 /* Virtual memory state */
406 FILE *vm_fp;
407 int vm_fd;
408 off_t vm_next_page; /* Next probably empty page */
409 off_t vm_near_pages; /* Number of pages allocated sequentially */
410 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
411 time_t unixtime; /* Unix time sampled every second. */
412 /* Virtual memory I/O threads stuff */
413 /* An I/O thread processes an element taken from the io_newjobs queue and
414 * puts the result of the operation in the io_processed list. While the
415 * job is being processed, it's put on the io_processing queue. */
416 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
417 list *io_processing; /* List of VM I/O jobs being processed */
418 list *io_processed; /* List of VM I/O jobs already processed */
419 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
420 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
421 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
422 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
423 pthread_attr_t io_threads_attr; /* attributes for threads creation */
424 int io_active_threads; /* Number of running I/O threads */
425 int vm_max_threads; /* Max number of I/O threads running at the same time */
426 /* Our main thread is blocked on the event loop, looking for sockets ready
427 * to be read or written, so when a threaded I/O operation is ready to be
428 * processed by the main thread, the I/O thread will use a unix pipe to
429 * awake the main thread. The following are the two pipe FDs. */
430 int io_ready_pipe_read;
431 int io_ready_pipe_write;
432 /* Virtual memory stats */
433 unsigned long long vm_stats_used_pages;
434 unsigned long long vm_stats_swapped_objects;
435 unsigned long long vm_stats_swapouts;
436 unsigned long long vm_stats_swapins;
437 FILE *devnull;
438 };
439
/* Signature of a command implementation: it receives the client that issued
 * the command. One entry of the command table follows. */
440 typedef void redisCommandProc(redisClient *c);
441 struct redisCommand {
442 char *name;
443 redisCommandProc *proc;
444 int arity;
445 int flags; /* REDIS_CMD_* flags, OR-ed together */
446 /* What keys should be loaded in background when calling this command? */
447 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
448 int vm_lastkey; /* The last argument that's a key */
449 int vm_keystep; /* The step between first and last key */
450 };
451
/* A name paired with an unsigned long 'pointer' value.
 * NOTE(review): presumably a function-name/address table entry used when
 * printing stack traces - confirm against the users of this type. */
452 struct redisFunctionSym {
453 char *name;
454 unsigned long pointer;
455 };
456
/* An object together with a union holding either a double 'score' or a
 * 'cmpobj' object. NOTE(review): presumably the key SORT compares by -
 * confirm in sortCommand(). */
457 typedef struct _redisSortObject {
458 robj *obj;
459 union {
460 double score;
461 robj *cmpobj;
462 } u;
463 } redisSortObject;
464
/* An operation type plus its pattern; presumably 'type' holds one of the
 * REDIS_SORT_* values - TODO confirm. */
465 typedef struct _redisSortOperation {
466 int type;
467 robj *pattern;
468 } redisSortOperation;
469
470 /* ZSETs use a specialized version of Skiplists */
471
472 typedef struct zskiplistNode {
473 struct zskiplistNode **forward; /* presumably one pointer per level - confirm in the node constructor */
474 struct zskiplistNode *backward;
475 unsigned int *span;
476 double score;
477 robj *obj;
478 } zskiplistNode;
479
480 typedef struct zskiplist {
481 struct zskiplistNode *header, *tail;
482 unsigned long length;
483 int level;
484 } zskiplist;
485
/* A sorted set pairs a dict with a skiplist. */
486 typedef struct zset {
487 dict *dict;
488 zskiplist *zsl;
489 } zset;
490
491 /* Our shared "common" objects. The declaration both defines the struct type
 * and the single global instance named 'shared'. */
492
493 struct sharedObjectsStruct {
494 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
495 *colon, *nullbulk, *nullmultibulk, *queued,
496 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
497 *outofrangeerr, *plus,
498 *select0, *select1, *select2, *select3, *select4,
499 *select5, *select6, *select7, *select8, *select9;
500 } shared;
501
502 /* Global vars that are actually used as constants. The following double
503 * values are used for double on-disk serialization, and are initialized
504 * at runtime to avoid strange compiler optimizations. */
505
506 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
507
508 /* VM threaded I/O request message */
509 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
510 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
511 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
512 typedef struct iojob {
513 int type; /* Request type, REDIS_IOJOB_* */
514 redisDb *db;/* Redis database */
515 robj *key; /* This I/O request is about swapping this key */
516 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
517 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
518 off_t page; /* Swap page where to read/write the object */
519 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
520 int canceled; /* True if this command was canceled by blocking side of VM */
521 pthread_t thread; /* ID of the thread processing this entry */
522 } iojob;
523
524 /*================================ Prototypes =============================== */
525
/* Forward declarations of the file-local (static) helper functions, so they
 * can be referenced before their definitions. */
526 static void freeStringObject(robj *o);
527 static void freeListObject(robj *o);
528 static void freeSetObject(robj *o);
529 static void decrRefCount(void *o);
530 static robj *createObject(int type, void *ptr);
531 static void freeClient(redisClient *c);
532 static int rdbLoad(char *filename);
533 static void addReply(redisClient *c, robj *obj);
534 static void addReplySds(redisClient *c, sds s);
535 static void incrRefCount(robj *o);
536 static int rdbSaveBackground(char *filename);
537 static robj *createStringObject(char *ptr, size_t len);
538 static robj *dupStringObject(robj *o);
539 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
540 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
541 static int syncWithMaster(void);
542 static robj *tryObjectSharing(robj *o);
543 static int tryObjectEncoding(robj *o);
544 static robj *getDecodedObject(robj *o);
545 static int removeExpire(redisDb *db, robj *key);
546 static int expireIfNeeded(redisDb *db, robj *key);
547 static int deleteIfVolatile(redisDb *db, robj *key);
548 static int deleteIfSwapped(redisDb *db, robj *key);
549 static int deleteKey(redisDb *db, robj *key);
550 static time_t getExpire(redisDb *db, robj *key);
551 static int setExpire(redisDb *db, robj *key, time_t when);
552 static void updateSlavesWaitingBgsave(int bgsaveerr);
553 static void freeMemoryIfNeeded(void);
554 static int processCommand(redisClient *c);
555 static void setupSigSegvAction(void);
556 static void rdbRemoveTempFile(pid_t childpid);
557 static void aofRemoveTempFile(pid_t childpid);
558 static size_t stringObjectLen(robj *o);
559 static void processInputBuffer(redisClient *c);
560 static zskiplist *zslCreate(void);
561 static void zslFree(zskiplist *zsl);
562 static void zslInsert(zskiplist *zsl, double score, robj *obj);
563 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
564 static void initClientMultiState(redisClient *c);
565 static void freeClientMultiState(redisClient *c);
566 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
567 static void unblockClientWaitingData(redisClient *c);
568 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
569 static void vmInit(void);
570 static void vmMarkPagesFree(off_t page, off_t count);
571 static robj *vmLoadObject(robj *key);
572 static robj *vmPreviewObject(robj *key);
573 static int vmSwapOneObjectBlocking(void);
574 static int vmSwapOneObjectThreaded(void);
575 static int vmCanSwapOut(void);
576 static int tryFreeOneObjectFromFreelist(void);
577 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
578 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
579 static void vmCancelThreadedIOJob(robj *o);
580 static void lockThreadedIO(void);
581 static void unlockThreadedIO(void);
582 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
583 static void freeIOJob(iojob *j);
584 static void queueIOJob(iojob *j);
585 static int vmWriteObjectOnSwap(robj *o, off_t page);
586 static robj *vmReadObjectFromSwap(off_t page, int type);
587 static void waitEmptyIOJobsQueue(void);
588 static void vmReopenSwapFile(void);
589 static int vmFreePage(off_t page);
590 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
591 static int dontWaitForSwappedKey(redisClient *c, robj *key);
592 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
593 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
594 static struct redisCommand *lookupCommand(char *name);
595 static void call(redisClient *c, struct redisCommand *cmd);
596 static void resetClient(redisClient *c);
597 static void convertToRealHash(robj *o);
598
/* Forward declarations of the command implementations referenced by the
 * command table. They all share the redisCommandProc signature: they take
 * the client that issued the command and return nothing.
 * NOTE(review): rpoplpushcommand is spelled with a lowercase 'c', unlike
 * every other *Command name; the command table uses the same spelling, so
 * a rename must touch the prototype, the table entry and the definition. */
599 static void authCommand(redisClient *c);
600 static void pingCommand(redisClient *c);
601 static void echoCommand(redisClient *c);
602 static void setCommand(redisClient *c);
603 static void setnxCommand(redisClient *c);
604 static void getCommand(redisClient *c);
605 static void delCommand(redisClient *c);
606 static void existsCommand(redisClient *c);
607 static void incrCommand(redisClient *c);
608 static void decrCommand(redisClient *c);
609 static void incrbyCommand(redisClient *c);
610 static void decrbyCommand(redisClient *c);
611 static void selectCommand(redisClient *c);
612 static void randomkeyCommand(redisClient *c);
613 static void keysCommand(redisClient *c);
614 static void dbsizeCommand(redisClient *c);
615 static void lastsaveCommand(redisClient *c);
616 static void saveCommand(redisClient *c);
617 static void bgsaveCommand(redisClient *c);
618 static void bgrewriteaofCommand(redisClient *c);
619 static void shutdownCommand(redisClient *c);
620 static void moveCommand(redisClient *c);
621 static void renameCommand(redisClient *c);
622 static void renamenxCommand(redisClient *c);
623 static void lpushCommand(redisClient *c);
624 static void rpushCommand(redisClient *c);
625 static void lpopCommand(redisClient *c);
626 static void rpopCommand(redisClient *c);
627 static void llenCommand(redisClient *c);
628 static void lindexCommand(redisClient *c);
629 static void lrangeCommand(redisClient *c);
630 static void ltrimCommand(redisClient *c);
631 static void typeCommand(redisClient *c);
632 static void lsetCommand(redisClient *c);
633 static void saddCommand(redisClient *c);
634 static void sremCommand(redisClient *c);
635 static void smoveCommand(redisClient *c);
636 static void sismemberCommand(redisClient *c);
637 static void scardCommand(redisClient *c);
638 static void spopCommand(redisClient *c);
639 static void srandmemberCommand(redisClient *c);
640 static void sinterCommand(redisClient *c);
641 static void sinterstoreCommand(redisClient *c);
642 static void sunionCommand(redisClient *c);
643 static void sunionstoreCommand(redisClient *c);
644 static void sdiffCommand(redisClient *c);
645 static void sdiffstoreCommand(redisClient *c);
646 static void syncCommand(redisClient *c);
647 static void flushdbCommand(redisClient *c);
648 static void flushallCommand(redisClient *c);
649 static void sortCommand(redisClient *c);
650 static void lremCommand(redisClient *c);
651 static void rpoplpushcommand(redisClient *c);
652 static void infoCommand(redisClient *c);
653 static void mgetCommand(redisClient *c);
654 static void monitorCommand(redisClient *c);
655 static void expireCommand(redisClient *c);
656 static void expireatCommand(redisClient *c);
657 static void getsetCommand(redisClient *c);
658 static void ttlCommand(redisClient *c);
659 static void slaveofCommand(redisClient *c);
660 static void debugCommand(redisClient *c);
661 static void msetCommand(redisClient *c);
662 static void msetnxCommand(redisClient *c);
663 static void zaddCommand(redisClient *c);
664 static void zincrbyCommand(redisClient *c);
665 static void zrangeCommand(redisClient *c);
666 static void zrangebyscoreCommand(redisClient *c);
667 static void zcountCommand(redisClient *c);
668 static void zrevrangeCommand(redisClient *c);
669 static void zcardCommand(redisClient *c);
670 static void zremCommand(redisClient *c);
671 static void zscoreCommand(redisClient *c);
672 static void zremrangebyscoreCommand(redisClient *c);
673 static void multiCommand(redisClient *c);
674 static void execCommand(redisClient *c);
675 static void discardCommand(redisClient *c);
676 static void blpopCommand(redisClient *c);
677 static void brpopCommand(redisClient *c);
678 static void appendCommand(redisClient *c);
679 static void substrCommand(redisClient *c);
680 static void zrankCommand(redisClient *c);
681 static void zrevrankCommand(redisClient *c);
682 static void hsetCommand(redisClient *c);
683 static void hgetCommand(redisClient *c);
684 static void hdelCommand(redisClient *c);
685 static void hlenCommand(redisClient *c);
686 static void zremrangebyrankCommand(redisClient *c);
687 static void zunionCommand(redisClient *c);
688 static void zinterCommand(redisClient *c);
689
690 /*================================= Globals ================================= */
691
692 /* Global vars */
693 static struct redisServer server; /* server global state: the single file-scope instance */
694 static struct redisCommand cmdTable[] = {
695 {"get",getCommand,2,REDIS_CMD_INLINE,1,1,1},
696 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
697 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,0,0,0},
698 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
699 {"substr",substrCommand,4,REDIS_CMD_INLINE,1,1,1},
700 {"del",delCommand,-2,REDIS_CMD_INLINE,0,0,0},
701 {"exists",existsCommand,2,REDIS_CMD_INLINE,1,1,1},
702 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
703 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
704 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,1,-1,1},
705 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
706 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
707 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,1,1,1},
708 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,1,1,1},
709 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
710 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,1,1,1},
711 {"llen",llenCommand,2,REDIS_CMD_INLINE,1,1,1},
712 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,1,1,1},
713 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
714 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,1,1,1},
715 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,1,1,1},
716 {"lrem",lremCommand,4,REDIS_CMD_BULK,1,1,1},
717 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,2,1},
718 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
719 {"srem",sremCommand,3,REDIS_CMD_BULK,1,1,1},
720 {"smove",smoveCommand,4,REDIS_CMD_BULK,1,2,1},
721 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,1,1,1},
722 {"scard",scardCommand,2,REDIS_CMD_INLINE,1,1,1},
723 {"spop",spopCommand,2,REDIS_CMD_INLINE,1,1,1},
724 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,1,1,1},
725 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
726 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
727 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
728 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
729 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,-1,1},
730 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,2,-1,1},
731 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,1,1,1},
732 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
733 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
734 {"zrem",zremCommand,3,REDIS_CMD_BULK,1,1,1},
735 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,1,1,1},
736 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,1,1,1},
737 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
738 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,0,0,0},
739 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
740 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,1,1,1},
741 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,1,1,1},
742 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,1,1,1},
743 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,1,1,1},
744 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
745 {"zrank",zrankCommand,3,REDIS_CMD_BULK,1,1,1},
746 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,1,1,1},
747 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
748 {"hget",hgetCommand,3,REDIS_CMD_BULK,1,1,1},
749 {"hdel",hdelCommand,3,REDIS_CMD_BULK,1,1,1},
750 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,1,1,1},
751 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
752 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
753 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,1,1},
754 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
755 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,1,-1,2},
756 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,0,0,0},
757 {"select",selectCommand,2,REDIS_CMD_INLINE,0,0,0},
758 {"move",moveCommand,3,REDIS_CMD_INLINE,1,1,1},
759 {"rename",renameCommand,3,REDIS_CMD_INLINE,1,1,1},
760 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,1,1,1},
761 {"expire",expireCommand,3,REDIS_CMD_INLINE,0,0,0},
762 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,0,0,0},
763 {"keys",keysCommand,2,REDIS_CMD_INLINE,0,0,0},
764 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,0,0,0},
765 {"auth",authCommand,2,REDIS_CMD_INLINE,0,0,0},
766 {"ping",pingCommand,1,REDIS_CMD_INLINE,0,0,0},
767 {"echo",echoCommand,2,REDIS_CMD_BULK,0,0,0},
768 {"save",saveCommand,1,REDIS_CMD_INLINE,0,0,0},
769 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
770 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,0,0,0},
771 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,0,0,0},
772 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,0,0,0},
773 {"type",typeCommand,2,REDIS_CMD_INLINE,1,1,1},
774 {"multi",multiCommand,1,REDIS_CMD_INLINE,0,0,0},
775 {"exec",execCommand,1,REDIS_CMD_INLINE,0,0,0},
776 {"discard",discardCommand,1,REDIS_CMD_INLINE,0,0,0},
777 {"sync",syncCommand,1,REDIS_CMD_INLINE,0,0,0},
778 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,0,0,0},
779 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,0,0,0},
780 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,1,1,1},
781 {"info",infoCommand,1,REDIS_CMD_INLINE,0,0,0},
782 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,0,0,0},
783 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,1,1,1},
784 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,0,0,0},
785 {"debug",debugCommand,-2,REDIS_CMD_INLINE,0,0,0},
786 {NULL,NULL,0,0,0,0,0}
787 };
788
789 /*============================ Utility functions ============================ */
790
/* Glob-style pattern matching.
 *
 * Returns 1 if the first 'stringLen' bytes of 'string' match the first
 * 'patternLen' bytes of 'pattern', 0 otherwise. Both buffers are treated as
 * length-delimited: no byte at or past the given length is ever read, so
 * they do not need to be NUL terminated.
 *
 * Supported syntax:
 *   *       any sequence of bytes (including the empty one)
 *   ?       exactly one byte
 *   [...]   one byte from the class; a leading '^' negates it, 'a-z' is a
 *           range and '\' escapes the next byte. An unterminated class is
 *           accepted as if it were closed at the end of the pattern.
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero literals and ranges are compared after
 * tolower(). Escaped bytes inside a class are always compared as is. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    /* Every case below consumes at least one byte of 'string', so the loop
     * is only entered while there is one: string[0] is always valid. */
    while (patternLen > 0 && stringLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    /* Bytes are compared as unsigned values so the result
                     * does not depend on the signedness of plain char and
                     * tolower() always gets a valid argument. */
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
    }

    /* The string is over: what is left of the pattern can only match if it
     * is made of '*' characters. */
    if (stringLen == 0) {
        while (patternLen > 0 && pattern[0] == '*') {
            pattern++;
            patternLen--;
        }
    }
    return (patternLen == 0 && stringLen == 0);
}
913
914 static void redisLog(int level, const char *fmt, ...) {
915 va_list ap;
916 FILE *fp;
917
918 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
919 if (!fp) return;
920
921 va_start(ap, fmt);
922 if (level >= server.verbosity) {
923 char *c = ".-*#";
924 char buf[64];
925 time_t now;
926
927 now = time(NULL);
928 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
929 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
930 vfprintf(fp, fmt, ap);
931 fprintf(fp,"\n");
932 fflush(fp);
933 }
934 va_end(ap);
935
936 if (server.logfile) fclose(fp);
937 }
938
939 /*====================== Hash table type implementation ==================== */
940
941 /* This is a hash table type that uses the SDS dynamic strings library as
942 * keys and Redis objects as values (objects can hold SDS strings,
943 * lists, sets). */
944
/* Dict destructor callback: hands 'val' to zfree(). The private data
 * pointer is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
950
951 static void dictListDestructor(void *privdata, void *val)
952 {
953 DICT_NOTUSED(privdata);
954 listRelease((list*)val);
955 }
956
957 static int sdsDictKeyCompare(void *privdata, const void *key1,
958 const void *key2)
959 {
960 int l1,l2;
961 DICT_NOTUSED(privdata);
962
963 l1 = sdslen((sds)key1);
964 l2 = sdslen((sds)key2);
965 if (l1 != l2) return 0;
966 return memcmp(key1, key2, l1) == 0;
967 }
968
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is ignored because the values of swapped out keys
 * are set to NULL. The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
976
977 static int dictObjKeyCompare(void *privdata, const void *key1,
978 const void *key2)
979 {
980 const robj *o1 = key1, *o2 = key2;
981 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
982 }
983
984 static unsigned int dictObjHash(const void *key) {
985 const robj *o = key;
986 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
987 }
988
989 static int dictEncObjKeyCompare(void *privdata, const void *key1,
990 const void *key2)
991 {
992 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
993 int cmp;
994
995 o1 = getDecodedObject(o1);
996 o2 = getDecodedObject(o2);
997 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
998 decrRefCount(o1);
999 decrRefCount(o2);
1000 return cmp;
1001 }
1002
1003 static unsigned int dictEncObjHash(const void *key) {
1004 robj *o = (robj*) key;
1005
1006 if (o->encoding == REDIS_ENCODING_RAW) {
1007 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1008 } else {
1009 if (o->encoding == REDIS_ENCODING_INT) {
1010 char buf[32];
1011 int len;
1012
1013 len = snprintf(buf,32,"%ld",(long)o->ptr);
1014 return dictGenHashFunction((unsigned char*)buf, len);
1015 } else {
1016 unsigned int hash;
1017
1018 o = getDecodedObject(o);
1019 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1020 decrRefCount(o);
1021 return hash;
1022 }
1023 }
1024 }
1025
1026 /* Sets type: keys are redis objects hashed and compared through the
 * encoding-aware callbacks, and no value destructor is registered.
 * initServer() also uses this type for server.sharingpool. Note that the
 * per-DB expires table is created with keyptrDictType, not with this type. */
1027 static dictType setDictType = {
1028 dictEncObjHash, /* hash function */
1029 NULL, /* key dup */
1030 NULL, /* val dup */
1031 dictEncObjKeyCompare, /* key compare */
1032 dictRedisObjectDestructor, /* key destructor */
1033 NULL /* val destructor */
1034 };
1035
1036 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects handled by the encoding-aware callbacks; values
 * are released with dictVanillaFree(), that is, with zfree(). */
1037 static dictType zsetDictType = {
1038 dictEncObjHash, /* hash function */
1039 NULL, /* key dup */
1040 NULL, /* val dup */
1041 dictEncObjKeyCompare, /* key compare */
1042 dictRedisObjectDestructor, /* key destructor */
1043 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1044 };
1045
1046 /* Db->dict: the main key space of every database (see initServer()).
 * Keys and values are both redis objects, released with decrRefCount().
 * Keys are hashed and compared through their ->ptr sds string, without any
 * decoding step. */
1047 static dictType dbDictType = {
1048 dictObjHash, /* hash function */
1049 NULL, /* key dup */
1050 NULL, /* val dup */
1051 dictObjKeyCompare, /* key compare */
1052 dictRedisObjectDestructor, /* key destructor */
1053 dictRedisObjectDestructor /* val destructor */
1054 };
1055
1056 /* Db->expires: keys are redis objects, released with decrRefCount().
 * No value destructor is registered: serverCron() reads the stored value
 * back by casting it to time_t, so there is nothing to free. */
1057 static dictType keyptrDictType = {
1058 dictObjHash, /* hash function */
1059 NULL, /* key dup */
1060 NULL, /* val dup */
1061 dictObjKeyCompare, /* key compare */
1062 dictRedisObjectDestructor, /* key destructor */
1063 NULL /* val destructor */
1064 };
1065
1066 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys and values are both redis objects released with decrRefCount(); keys
 * go through the encoding-aware hash and compare callbacks. */
1067 static dictType hashDictType = {
1068 dictEncObjHash, /* hash function */
1069 NULL, /* key dup */
1070 NULL, /* val dup */
1071 dictEncObjKeyCompare, /* key compare */
1072 dictRedisObjectDestructor, /* key destructor */
1073 dictRedisObjectDestructor /* val destructor */
1074 };
1075
1076 /* Keylist hash table type has unencoded redis objects as keys and
1077 * lists as values. It's used for blocking operations (BLPOP) and to
1078 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * initServer() creates db->blockingkeys and, when VM is enabled, db->io_keys
 * with this type. Values are released with listRelease(). */
1079 static dictType keylistDictType = {
1080 dictObjHash, /* hash function */
1081 NULL, /* key dup */
1082 NULL, /* val dup */
1083 dictObjKeyCompare, /* key compare */
1084 dictRedisObjectDestructor, /* key destructor */
1085 dictListDestructor /* val destructor */
1086 };
1087
1088 /* ========================= Random utility functions ======================= */
1089
1090 /* Redis generally does not try to recover from out of memory conditions
1091 * when allocating objects or strings, it is not clear if it will be possible
1092 * to report this condition to the client since the networking layer itself
1093 * is based on heap allocation for send buffers, so we simply abort.
1094 * At least the code will be simpler to read... */
1095 static void oom(const char *msg) {
1096 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1097 sleep(1);
1098 abort();
1099 }
1100
1101 /* ====================== Redis server networking stuff ===================== */
1102 static void closeTimedoutClients(void) {
1103 redisClient *c;
1104 listNode *ln;
1105 time_t now = time(NULL);
1106 listIter li;
1107
1108 listRewind(server.clients,&li);
1109 while ((ln = listNext(&li)) != NULL) {
1110 c = listNodeValue(ln);
1111 if (server.maxidletime &&
1112 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1113 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1114 (now - c->lastinteraction > server.maxidletime))
1115 {
1116 redisLog(REDIS_VERBOSE,"Closing idle client");
1117 freeClient(c);
1118 } else if (c->flags & REDIS_BLOCKED) {
1119 if (c->blockingto != 0 && c->blockingto < now) {
1120 addReply(c,shared.nullmultibulk);
1121 unblockClientWaitingData(c);
1122 }
1123 }
1124 }
1125 }
1126
1127 static int htNeedsResize(dict *dict) {
1128 long long size, used;
1129
1130 size = dictSlots(dict);
1131 used = dictSize(dict);
1132 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1133 (used*100/size < REDIS_HT_MINFILL));
1134 }
1135
1136 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1137 * we resize the hash table to save memory */
1138 static void tryResizeHashTables(void) {
1139 int j;
1140
1141 for (j = 0; j < server.dbnum; j++) {
1142 if (htNeedsResize(server.db[j].dict)) {
1143 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1144 dictResize(server.db[j].dict);
1145 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1146 }
1147 if (htNeedsResize(server.db[j].expires))
1148 dictResize(server.db[j].expires);
1149 }
1150 }
1151
1152 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1153 void backgroundSaveDoneHandler(int statloc) {
1154 int exitcode = WEXITSTATUS(statloc);
1155 int bysignal = WIFSIGNALED(statloc);
1156
1157 if (!bysignal && exitcode == 0) {
1158 redisLog(REDIS_NOTICE,
1159 "Background saving terminated with success");
1160 server.dirty = 0;
1161 server.lastsave = time(NULL);
1162 } else if (!bysignal && exitcode != 0) {
1163 redisLog(REDIS_WARNING, "Background saving error");
1164 } else {
1165 redisLog(REDIS_WARNING,
1166 "Background saving terminated by signal");
1167 rdbRemoveTempFile(server.bgsavechildpid);
1168 }
1169 server.bgsavechildpid = -1;
1170 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1171 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1172 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1173 }
1174
1175 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1176 * Handle this. */
1177 void backgroundRewriteDoneHandler(int statloc) {
1178 int exitcode = WEXITSTATUS(statloc);
1179 int bysignal = WIFSIGNALED(statloc);
1180
1181 if (!bysignal && exitcode == 0) {
1182 int fd;
1183 char tmpfile[256];
1184
1185 redisLog(REDIS_NOTICE,
1186 "Background append only file rewriting terminated with success");
1187 /* Now it's time to flush the differences accumulated by the parent */
1188 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1189 fd = open(tmpfile,O_WRONLY|O_APPEND);
1190 if (fd == -1) {
1191 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1192 goto cleanup;
1193 }
1194 /* Flush our data... */
1195 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1196 (signed) sdslen(server.bgrewritebuf)) {
1197 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1198 close(fd);
1199 goto cleanup;
1200 }
1201 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1202 /* Now our work is to rename the temp file into the stable file. And
1203 * switch the file descriptor used by the server for append only. */
1204 if (rename(tmpfile,server.appendfilename) == -1) {
1205 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1206 close(fd);
1207 goto cleanup;
1208 }
1209 /* Mission completed... almost */
1210 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1211 if (server.appendfd != -1) {
1212 /* If append only is actually enabled... */
1213 close(server.appendfd);
1214 server.appendfd = fd;
1215 fsync(fd);
1216 server.appendseldb = -1; /* Make sure it will issue SELECT */
1217 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1218 } else {
1219 /* If append only is disabled we just generate a dump in this
1220 * format. Why not? */
1221 close(fd);
1222 }
1223 } else if (!bysignal && exitcode != 0) {
1224 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1225 } else {
1226 redisLog(REDIS_WARNING,
1227 "Background append only file rewriting terminated by signal");
1228 }
1229 cleanup:
1230 sdsfree(server.bgrewritebuf);
1231 server.bgrewritebuf = sdsempty();
1232 aofRemoveTempFile(server.bgrewritechildpid);
1233 server.bgrewritechildpid = -1;
1234 }
1235
1236 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1237 int j, loops = server.cronloops++;
1238 REDIS_NOTUSED(eventLoop);
1239 REDIS_NOTUSED(id);
1240 REDIS_NOTUSED(clientData);
1241
1242 /* We take a cached value of the unix time in the global state because
1243 * with virtual memory and aging there is to store the current time
1244 * in objects at every object access, and accuracy is not needed.
1245 * To access a global var is faster than calling time(NULL) */
1246 server.unixtime = time(NULL);
1247
1248 /* Show some info about non-empty databases */
1249 for (j = 0; j < server.dbnum; j++) {
1250 long long size, used, vkeys;
1251
1252 size = dictSlots(server.db[j].dict);
1253 used = dictSize(server.db[j].dict);
1254 vkeys = dictSize(server.db[j].expires);
1255 if (!(loops % 5) && (used || vkeys)) {
1256 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1257 /* dictPrintStats(server.dict); */
1258 }
1259 }
1260
1261 /* We don't want to resize the hash tables while a bacground saving
1262 * is in progress: the saving child is created using fork() that is
1263 * implemented with a copy-on-write semantic in most modern systems, so
1264 * if we resize the HT while there is the saving child at work actually
1265 * a lot of memory movements in the parent will cause a lot of pages
1266 * copied. */
1267 if (server.bgsavechildpid == -1) tryResizeHashTables();
1268
1269 /* Show information about connected clients */
1270 if (!(loops % 5)) {
1271 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1272 listLength(server.clients)-listLength(server.slaves),
1273 listLength(server.slaves),
1274 zmalloc_used_memory(),
1275 dictSize(server.sharingpool));
1276 }
1277
1278 /* Close connections of timedout clients */
1279 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
1280 closeTimedoutClients();
1281
1282 /* Check if a background saving or AOF rewrite in progress terminated */
1283 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1284 int statloc;
1285 pid_t pid;
1286
1287 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1288 if (pid == server.bgsavechildpid) {
1289 backgroundSaveDoneHandler(statloc);
1290 } else {
1291 backgroundRewriteDoneHandler(statloc);
1292 }
1293 }
1294 } else {
1295 /* If there is not a background saving in progress check if
1296 * we have to save now */
1297 time_t now = time(NULL);
1298 for (j = 0; j < server.saveparamslen; j++) {
1299 struct saveparam *sp = server.saveparams+j;
1300
1301 if (server.dirty >= sp->changes &&
1302 now-server.lastsave > sp->seconds) {
1303 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1304 sp->changes, sp->seconds);
1305 rdbSaveBackground(server.dbfilename);
1306 break;
1307 }
1308 }
1309 }
1310
1311 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1312 * will use few CPU cycles if there are few expiring keys, otherwise
1313 * it will get more aggressive to avoid that too much memory is used by
1314 * keys that can be removed from the keyspace. */
1315 for (j = 0; j < server.dbnum; j++) {
1316 int expired;
1317 redisDb *db = server.db+j;
1318
1319 /* Continue to expire if at the end of the cycle more than 25%
1320 * of the keys were expired. */
1321 do {
1322 long num = dictSize(db->expires);
1323 time_t now = time(NULL);
1324
1325 expired = 0;
1326 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1327 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1328 while (num--) {
1329 dictEntry *de;
1330 time_t t;
1331
1332 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1333 t = (time_t) dictGetEntryVal(de);
1334 if (now > t) {
1335 deleteKey(db,dictGetEntryKey(de));
1336 expired++;
1337 }
1338 }
1339 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1340 }
1341
1342 /* Swap a few keys on disk if we are over the memory limit and VM
1343 * is enbled. Try to free objects from the free list first. */
1344 if (vmCanSwapOut()) {
1345 while (server.vm_enabled && zmalloc_used_memory() >
1346 server.vm_max_memory)
1347 {
1348 int retval;
1349
1350 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1351 retval = (server.vm_max_threads == 0) ?
1352 vmSwapOneObjectBlocking() :
1353 vmSwapOneObjectThreaded();
1354 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1355 zmalloc_used_memory() >
1356 (server.vm_max_memory+server.vm_max_memory/10))
1357 {
1358 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1359 }
1360 /* Note that when using threade I/O we free just one object,
1361 * because anyway when the I/O thread in charge to swap this
1362 * object out will finish, the handler of completed jobs
1363 * will try to swap more objects if we are still out of memory. */
1364 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1365 }
1366 }
1367
1368 /* Check if we should connect to a MASTER */
1369 if (server.replstate == REDIS_REPL_CONNECT) {
1370 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1371 if (syncWithMaster() == REDIS_OK) {
1372 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1373 }
1374 }
1375 return 1000;
1376 }
1377
1378 /* This function gets called every time Redis is entering the
1379 * main loop of the event driven library, that is, before to sleep
1380 * for ready file descriptors. */
1381 static void beforeSleep(struct aeEventLoop *eventLoop) {
1382 REDIS_NOTUSED(eventLoop);
1383
1384 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1385 listIter li;
1386 listNode *ln;
1387
1388 listRewind(server.io_ready_clients,&li);
1389 while((ln = listNext(&li))) {
1390 redisClient *c = ln->value;
1391 struct redisCommand *cmd;
1392
1393 /* Resume the client. */
1394 listDelNode(server.io_ready_clients,ln);
1395 c->flags &= (~REDIS_IO_WAIT);
1396 server.vm_blocked_clients--;
1397 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1398 readQueryFromClient, c);
1399 cmd = lookupCommand(c->argv[0]->ptr);
1400 assert(cmd != NULL);
1401 call(c,cmd);
1402 resetClient(c);
1403 /* There may be more data to process in the input buffer. */
1404 if (c->querybuf && sdslen(c->querybuf) > 0)
1405 processInputBuffer(c);
1406 }
1407 }
1408 }
1409
1410 static void createSharedObjects(void) {
1411 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1412 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1413 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1414 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1415 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1416 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1417 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1418 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1419 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1420 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1421 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1422 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1423 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1424 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1425 "-ERR no such key\r\n"));
1426 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1427 "-ERR syntax error\r\n"));
1428 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1429 "-ERR source and destination objects are the same\r\n"));
1430 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1431 "-ERR index out of range\r\n"));
1432 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1433 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1434 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1435 shared.select0 = createStringObject("select 0\r\n",10);
1436 shared.select1 = createStringObject("select 1\r\n",10);
1437 shared.select2 = createStringObject("select 2\r\n",10);
1438 shared.select3 = createStringObject("select 3\r\n",10);
1439 shared.select4 = createStringObject("select 4\r\n",10);
1440 shared.select5 = createStringObject("select 5\r\n",10);
1441 shared.select6 = createStringObject("select 6\r\n",10);
1442 shared.select7 = createStringObject("select 7\r\n",10);
1443 shared.select8 = createStringObject("select 8\r\n",10);
1444 shared.select9 = createStringObject("select 9\r\n",10);
1445 }
1446
1447 static void appendServerSaveParams(time_t seconds, int changes) {
1448 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1449 server.saveparams[server.saveparamslen].seconds = seconds;
1450 server.saveparams[server.saveparamslen].changes = changes;
1451 server.saveparamslen++;
1452 }
1453
1454 static void resetServerSaveParams() {
1455 zfree(server.saveparams);
1456 server.saveparams = NULL;
1457 server.saveparamslen = 0;
1458 }
1459
1460 static void initServerConfig() {
1461 server.dbnum = REDIS_DEFAULT_DBNUM;
1462 server.port = REDIS_SERVERPORT;
1463 server.verbosity = REDIS_VERBOSE;
1464 server.maxidletime = REDIS_MAXIDLETIME;
1465 server.saveparams = NULL;
1466 server.logfile = NULL; /* NULL = log on standard output */
1467 server.bindaddr = NULL;
1468 server.glueoutputbuf = 1;
1469 server.daemonize = 0;
1470 server.appendonly = 0;
1471 server.appendfsync = APPENDFSYNC_ALWAYS;
1472 server.lastfsync = time(NULL);
1473 server.appendfd = -1;
1474 server.appendseldb = -1; /* Make sure the first time will not match */
1475 server.pidfile = "/var/run/redis.pid";
1476 server.dbfilename = "dump.rdb";
1477 server.appendfilename = "appendonly.aof";
1478 server.requirepass = NULL;
1479 server.shareobjects = 0;
1480 server.rdbcompression = 1;
1481 server.sharingpoolsize = 1024;
1482 server.maxclients = 0;
1483 server.blpop_blocked_clients = 0;
1484 server.maxmemory = 0;
1485 server.vm_enabled = 0;
1486 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1487 server.vm_page_size = 256; /* 256 bytes per page */
1488 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1489 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1490 server.vm_max_threads = 4;
1491 server.vm_blocked_clients = 0;
1492 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1493 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1494
1495 resetServerSaveParams();
1496
1497 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1498 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1499 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1500 /* Replication related */
1501 server.isslave = 0;
1502 server.masterauth = NULL;
1503 server.masterhost = NULL;
1504 server.masterport = 6379;
1505 server.master = NULL;
1506 server.replstate = REDIS_REPL_NONE;
1507
1508 /* Double constants initialization */
1509 R_Zero = 0.0;
1510 R_PosInf = 1.0/R_Zero;
1511 R_NegInf = -1.0/R_Zero;
1512 R_Nan = R_Zero/R_Zero;
1513 }
1514
1515 static void initServer() {
1516 int j;
1517
1518 signal(SIGHUP, SIG_IGN);
1519 signal(SIGPIPE, SIG_IGN);
1520 setupSigSegvAction();
1521
1522 server.devnull = fopen("/dev/null","w");
1523 if (server.devnull == NULL) {
1524 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1525 exit(1);
1526 }
1527 server.clients = listCreate();
1528 server.slaves = listCreate();
1529 server.monitors = listCreate();
1530 server.objfreelist = listCreate();
1531 createSharedObjects();
1532 server.el = aeCreateEventLoop();
1533 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1534 server.sharingpool = dictCreate(&setDictType,NULL);
1535 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1536 if (server.fd == -1) {
1537 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1538 exit(1);
1539 }
1540 for (j = 0; j < server.dbnum; j++) {
1541 server.db[j].dict = dictCreate(&dbDictType,NULL);
1542 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1543 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1544 if (server.vm_enabled)
1545 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1546 server.db[j].id = j;
1547 }
1548 server.cronloops = 0;
1549 server.bgsavechildpid = -1;
1550 server.bgrewritechildpid = -1;
1551 server.bgrewritebuf = sdsempty();
1552 server.lastsave = time(NULL);
1553 server.dirty = 0;
1554 server.stat_numcommands = 0;
1555 server.stat_numconnections = 0;
1556 server.stat_starttime = time(NULL);
1557 server.unixtime = time(NULL);
1558 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1559 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1560 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1561
1562 if (server.appendonly) {
1563 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1564 if (server.appendfd == -1) {
1565 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1566 strerror(errno));
1567 exit(1);
1568 }
1569 }
1570
1571 if (server.vm_enabled) vmInit();
1572 }
1573
1574 /* Empty the whole database */
1575 static long long emptyDb() {
1576 int j;
1577 long long removed = 0;
1578
1579 for (j = 0; j < server.dbnum; j++) {
1580 removed += dictSize(server.db[j].dict);
1581 dictEmpty(server.db[j].dict);
1582 dictEmpty(server.db[j].expires);
1583 }
1584 return removed;
1585 }
1586
/* Convert a "yes" / "no" string into a boolean, ignoring case.
 * Returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1592
1593 /* I agree, this is a very rudimental way to load a configuration...
1594 will improve later if the config gets more complex */
1595 static void loadServerConfig(char *filename) {
1596 FILE *fp;
1597 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1598 int linenum = 0;
1599 sds line = NULL;
1600
1601 if (filename[0] == '-' && filename[1] == '\0')
1602 fp = stdin;
1603 else {
1604 if ((fp = fopen(filename,"r")) == NULL) {
1605 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1606 exit(1);
1607 }
1608 }
1609
1610 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1611 sds *argv;
1612 int argc, j;
1613
1614 linenum++;
1615 line = sdsnew(buf);
1616 line = sdstrim(line," \t\r\n");
1617
1618 /* Skip comments and blank lines*/
1619 if (line[0] == '#' || line[0] == '\0') {
1620 sdsfree(line);
1621 continue;
1622 }
1623
1624 /* Split into arguments */
1625 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1626 sdstolower(argv[0]);
1627
1628 /* Execute config directives */
1629 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1630 server.maxidletime = atoi(argv[1]);
1631 if (server.maxidletime < 0) {
1632 err = "Invalid timeout value"; goto loaderr;
1633 }
1634 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1635 server.port = atoi(argv[1]);
1636 if (server.port < 1 || server.port > 65535) {
1637 err = "Invalid port"; goto loaderr;
1638 }
1639 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1640 server.bindaddr = zstrdup(argv[1]);
1641 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1642 int seconds = atoi(argv[1]);
1643 int changes = atoi(argv[2]);
1644 if (seconds < 1 || changes < 0) {
1645 err = "Invalid save parameters"; goto loaderr;
1646 }
1647 appendServerSaveParams(seconds,changes);
1648 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1649 if (chdir(argv[1]) == -1) {
1650 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1651 argv[1], strerror(errno));
1652 exit(1);
1653 }
1654 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1655 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1656 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1657 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1658 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1659 else {
1660 err = "Invalid log level. Must be one of debug, notice, warning";
1661 goto loaderr;
1662 }
1663 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1664 FILE *logfp;
1665
1666 server.logfile = zstrdup(argv[1]);
1667 if (!strcasecmp(server.logfile,"stdout")) {
1668 zfree(server.logfile);
1669 server.logfile = NULL;
1670 }
1671 if (server.logfile) {
1672 /* Test if we are able to open the file. The server will not
1673 * be able to abort just for this problem later... */
1674 logfp = fopen(server.logfile,"a");
1675 if (logfp == NULL) {
1676 err = sdscatprintf(sdsempty(),
1677 "Can't open the log file: %s", strerror(errno));
1678 goto loaderr;
1679 }
1680 fclose(logfp);
1681 }
1682 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1683 server.dbnum = atoi(argv[1]);
1684 if (server.dbnum < 1) {
1685 err = "Invalid number of databases"; goto loaderr;
1686 }
1687 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1688 server.maxclients = atoi(argv[1]);
1689 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1690 server.maxmemory = strtoll(argv[1], NULL, 10);
1691 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1692 server.masterhost = sdsnew(argv[1]);
1693 server.masterport = atoi(argv[2]);
1694 server.replstate = REDIS_REPL_CONNECT;
1695 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1696 server.masterauth = zstrdup(argv[1]);
1697 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1698 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1699 err = "argument must be 'yes' or 'no'"; goto loaderr;
1700 }
1701 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1702 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1703 err = "argument must be 'yes' or 'no'"; goto loaderr;
1704 }
1705 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1706 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1707 err = "argument must be 'yes' or 'no'"; goto loaderr;
1708 }
1709 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1710 server.sharingpoolsize = atoi(argv[1]);
1711 if (server.sharingpoolsize < 1) {
1712 err = "invalid object sharing pool size"; goto loaderr;
1713 }
1714 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1715 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1716 err = "argument must be 'yes' or 'no'"; goto loaderr;
1717 }
1718 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1719 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1720 err = "argument must be 'yes' or 'no'"; goto loaderr;
1721 }
1722 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1723 if (!strcasecmp(argv[1],"no")) {
1724 server.appendfsync = APPENDFSYNC_NO;
1725 } else if (!strcasecmp(argv[1],"always")) {
1726 server.appendfsync = APPENDFSYNC_ALWAYS;
1727 } else if (!strcasecmp(argv[1],"everysec")) {
1728 server.appendfsync = APPENDFSYNC_EVERYSEC;
1729 } else {
1730 err = "argument must be 'no', 'always' or 'everysec'";
1731 goto loaderr;
1732 }
1733 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1734 server.requirepass = zstrdup(argv[1]);
1735 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1736 server.pidfile = zstrdup(argv[1]);
1737 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1738 server.dbfilename = zstrdup(argv[1]);
1739 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1740 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1741 err = "argument must be 'yes' or 'no'"; goto loaderr;
1742 }
1743 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1744 zfree(server.vm_swap_file);
1745 server.vm_swap_file = zstrdup(argv[1]);
1746 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1747 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1748 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1749 server.vm_page_size = strtoll(argv[1], NULL, 10);
1750 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1751 server.vm_pages = strtoll(argv[1], NULL, 10);
1752 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1753 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1754 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1755 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1756 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1757 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1758 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1759 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1760 } else {
1761 err = "Bad directive or wrong number of arguments"; goto loaderr;
1762 }
1763 for (j = 0; j < argc; j++)
1764 sdsfree(argv[j]);
1765 zfree(argv);
1766 sdsfree(line);
1767 }
1768 if (fp != stdin) fclose(fp);
1769 return;
1770
1771 loaderr:
1772 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1773 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1774 fprintf(stderr, ">>> '%s'\n", line);
1775 fprintf(stderr, "%s\n", err);
1776 exit(1);
1777 }
1778
1779 static void freeClientArgv(redisClient *c) {
1780 int j;
1781
1782 for (j = 0; j < c->argc; j++)
1783 decrRefCount(c->argv[j]);
1784 for (j = 0; j < c->mbargc; j++)
1785 decrRefCount(c->mbargv[j]);
1786 c->argc = 0;
1787 c->mbargc = 0;
1788 }
1789
1790 static void freeClient(redisClient *c) {
1791 listNode *ln;
1792
1793 /* Note that if the client we are freeing is blocked into a blocking
1794 * call, we have to set querybuf to NULL *before* to call
1795 * unblockClientWaitingData() to avoid processInputBuffer() will get
1796 * called. Also it is important to remove the file events after
1797 * this, because this call adds the READABLE event. */
1798 sdsfree(c->querybuf);
1799 c->querybuf = NULL;
1800 if (c->flags & REDIS_BLOCKED)
1801 unblockClientWaitingData(c);
1802
1803 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1804 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1805 listRelease(c->reply);
1806 freeClientArgv(c);
1807 close(c->fd);
1808 /* Remove from the list of clients */
1809 ln = listSearchKey(server.clients,c);
1810 redisAssert(ln != NULL);
1811 listDelNode(server.clients,ln);
1812 /* Remove from the list of clients waiting for swapped keys */
1813 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1814 ln = listSearchKey(server.io_ready_clients,c);
1815 if (ln) {
1816 listDelNode(server.io_ready_clients,ln);
1817 server.vm_blocked_clients--;
1818 }
1819 }
1820 while (server.vm_enabled && listLength(c->io_keys)) {
1821 ln = listFirst(c->io_keys);
1822 dontWaitForSwappedKey(c,ln->value);
1823 }
1824 listRelease(c->io_keys);
1825 /* Other cleanup */
1826 if (c->flags & REDIS_SLAVE) {
1827 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1828 close(c->repldbfd);
1829 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1830 ln = listSearchKey(l,c);
1831 redisAssert(ln != NULL);
1832 listDelNode(l,ln);
1833 }
1834 if (c->flags & REDIS_MASTER) {
1835 server.master = NULL;
1836 server.replstate = REDIS_REPL_CONNECT;
1837 }
1838 zfree(c->argv);
1839 zfree(c->mbargv);
1840 freeClientMultiState(c);
1841 zfree(c);
1842 }
1843
1844 #define GLUEREPLY_UP_TO (1024)
1845 static void glueReplyBuffersIfNeeded(redisClient *c) {
1846 int copylen = 0;
1847 char buf[GLUEREPLY_UP_TO];
1848 listNode *ln;
1849 listIter li;
1850 robj *o;
1851
1852 listRewind(c->reply,&li);
1853 while((ln = listNext(&li))) {
1854 int objlen;
1855
1856 o = ln->value;
1857 objlen = sdslen(o->ptr);
1858 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1859 memcpy(buf+copylen,o->ptr,objlen);
1860 copylen += objlen;
1861 listDelNode(c->reply,ln);
1862 } else {
1863 if (copylen == 0) return;
1864 break;
1865 }
1866 }
1867 /* Now the output buffer is empty, add the new single element */
1868 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1869 listAddNodeHead(c->reply,o);
1870 }
1871
1872 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1873 redisClient *c = privdata;
1874 int nwritten = 0, totwritten = 0, objlen;
1875 robj *o;
1876 REDIS_NOTUSED(el);
1877 REDIS_NOTUSED(mask);
1878
1879 /* Use writev() if we have enough buffers to send */
1880 if (!server.glueoutputbuf &&
1881 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1882 !(c->flags & REDIS_MASTER))
1883 {
1884 sendReplyToClientWritev(el, fd, privdata, mask);
1885 return;
1886 }
1887
1888 while(listLength(c->reply)) {
1889 if (server.glueoutputbuf && listLength(c->reply) > 1)
1890 glueReplyBuffersIfNeeded(c);
1891
1892 o = listNodeValue(listFirst(c->reply));
1893 objlen = sdslen(o->ptr);
1894
1895 if (objlen == 0) {
1896 listDelNode(c->reply,listFirst(c->reply));
1897 continue;
1898 }
1899
1900 if (c->flags & REDIS_MASTER) {
1901 /* Don't reply to a master */
1902 nwritten = objlen - c->sentlen;
1903 } else {
1904 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1905 if (nwritten <= 0) break;
1906 }
1907 c->sentlen += nwritten;
1908 totwritten += nwritten;
1909 /* If we fully sent the object on head go to the next one */
1910 if (c->sentlen == objlen) {
1911 listDelNode(c->reply,listFirst(c->reply));
1912 c->sentlen = 0;
1913 }
1914 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1915 * bytes, in a single threaded server it's a good idea to serve
1916 * other clients as well, even if a very large request comes from
1917 * super fast link that is always able to accept data (in real world
1918 * scenario think about 'KEYS *' against the loopback interfae) */
1919 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1920 }
1921 if (nwritten == -1) {
1922 if (errno == EAGAIN) {
1923 nwritten = 0;
1924 } else {
1925 redisLog(REDIS_VERBOSE,
1926 "Error writing to client: %s", strerror(errno));
1927 freeClient(c);
1928 return;
1929 }
1930 }
1931 if (totwritten > 0) c->lastinteraction = time(NULL);
1932 if (listLength(c->reply) == 0) {
1933 c->sentlen = 0;
1934 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1935 }
1936 }
1937
1938 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1939 {
1940 redisClient *c = privdata;
1941 int nwritten = 0, totwritten = 0, objlen, willwrite;
1942 robj *o;
1943 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1944 int offset, ion = 0;
1945 REDIS_NOTUSED(el);
1946 REDIS_NOTUSED(mask);
1947
1948 listNode *node;
1949 while (listLength(c->reply)) {
1950 offset = c->sentlen;
1951 ion = 0;
1952 willwrite = 0;
1953
1954 /* fill-in the iov[] array */
1955 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1956 o = listNodeValue(node);
1957 objlen = sdslen(o->ptr);
1958
1959 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1960 break;
1961
1962 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1963 break; /* no more iovecs */
1964
1965 iov[ion].iov_base = ((char*)o->ptr) + offset;
1966 iov[ion].iov_len = objlen - offset;
1967 willwrite += objlen - offset;
1968 offset = 0; /* just for the first item */
1969 ion++;
1970 }
1971
1972 if(willwrite == 0)
1973 break;
1974
1975 /* write all collected blocks at once */
1976 if((nwritten = writev(fd, iov, ion)) < 0) {
1977 if (errno != EAGAIN) {
1978 redisLog(REDIS_VERBOSE,
1979 "Error writing to client: %s", strerror(errno));
1980 freeClient(c);
1981 return;
1982 }
1983 break;
1984 }
1985
1986 totwritten += nwritten;
1987 offset = c->sentlen;
1988
1989 /* remove written robjs from c->reply */
1990 while (nwritten && listLength(c->reply)) {
1991 o = listNodeValue(listFirst(c->reply));
1992 objlen = sdslen(o->ptr);
1993
1994 if(nwritten >= objlen - offset) {
1995 listDelNode(c->reply, listFirst(c->reply));
1996 nwritten -= objlen - offset;
1997 c->sentlen = 0;
1998 } else {
1999 /* partial write */
2000 c->sentlen += nwritten;
2001 break;
2002 }
2003 offset = 0;
2004 }
2005 }
2006
2007 if (totwritten > 0)
2008 c->lastinteraction = time(NULL);
2009
2010 if (listLength(c->reply) == 0) {
2011 c->sentlen = 0;
2012 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2013 }
2014 }
2015
2016 static struct redisCommand *lookupCommand(char *name) {
2017 int j = 0;
2018 while(cmdTable[j].name != NULL) {
2019 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2020 j++;
2021 }
2022 return NULL;
2023 }
2024
2025 /* resetClient prepare the client to process the next command */
2026 static void resetClient(redisClient *c) {
2027 freeClientArgv(c);
2028 c->bulklen = -1;
2029 c->multibulk = 0;
2030 }
2031
2032 /* Call() is the core of Redis execution of a command */
2033 static void call(redisClient *c, struct redisCommand *cmd) {
2034 long long dirty;
2035
2036 dirty = server.dirty;
2037 cmd->proc(c);
2038 if (server.appendonly && server.dirty-dirty)
2039 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2040 if (server.dirty-dirty && listLength(server.slaves))
2041 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
2042 if (listLength(server.monitors))
2043 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
2044 server.stat_numcommands++;
2045 }
2046
2047 /* If this function gets called we already read a whole
2048 * command, argments are in the client argv/argc fields.
2049 * processCommand() execute the command or prepare the
2050 * server for a bulk read from the client.
2051 *
2052 * If 1 is returned the client is still alive and valid and
2053 * and other operations can be performed by the caller. Otherwise
2054 * if 0 is returned the client was destroied (i.e. after QUIT). */
2055 static int processCommand(redisClient *c) {
2056 struct redisCommand *cmd;
2057
2058 /* Free some memory if needed (maxmemory setting) */
2059 if (server.maxmemory) freeMemoryIfNeeded();
2060
2061 /* Handle the multi bulk command type. This is an alternative protocol
2062 * supported by Redis in order to receive commands that are composed of
2063 * multiple binary-safe "bulk" arguments. The latency of processing is
2064 * a bit higher but this allows things like multi-sets, so if this
2065 * protocol is used only for MSET and similar commands this is a big win. */
2066 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2067 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2068 if (c->multibulk <= 0) {
2069 resetClient(c);
2070 return 1;
2071 } else {
2072 decrRefCount(c->argv[c->argc-1]);
2073 c->argc--;
2074 return 1;
2075 }
2076 } else if (c->multibulk) {
2077 if (c->bulklen == -1) {
2078 if (((char*)c->argv[0]->ptr)[0] != '$') {
2079 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2080 resetClient(c);
2081 return 1;
2082 } else {
2083 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2084 decrRefCount(c->argv[0]);
2085 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2086 c->argc--;
2087 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2088 resetClient(c);
2089 return 1;
2090 }
2091 c->argc--;
2092 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2093 return 1;
2094 }
2095 } else {
2096 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2097 c->mbargv[c->mbargc] = c->argv[0];
2098 c->mbargc++;
2099 c->argc--;
2100 c->multibulk--;
2101 if (c->multibulk == 0) {
2102 robj **auxargv;
2103 int auxargc;
2104
2105 /* Here we need to swap the multi-bulk argc/argv with the
2106 * normal argc/argv of the client structure. */
2107 auxargv = c->argv;
2108 c->argv = c->mbargv;
2109 c->mbargv = auxargv;
2110
2111 auxargc = c->argc;
2112 c->argc = c->mbargc;
2113 c->mbargc = auxargc;
2114
2115 /* We need to set bulklen to something different than -1
2116 * in order for the code below to process the command without
2117 * to try to read the last argument of a bulk command as
2118 * a special argument. */
2119 c->bulklen = 0;
2120 /* continue below and process the command */
2121 } else {
2122 c->bulklen = -1;
2123 return 1;
2124 }
2125 }
2126 }
2127 /* -- end of multi bulk commands processing -- */
2128
2129 /* The QUIT command is handled as a special case. Normal command
2130 * procs are unable to close the client connection safely */
2131 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2132 freeClient(c);
2133 return 0;
2134 }
2135
2136 /* Now lookup the command and check ASAP about trivial error conditions
2137 * such wrong arity, bad command name and so forth. */
2138 cmd = lookupCommand(c->argv[0]->ptr);
2139 if (!cmd) {
2140 addReplySds(c,
2141 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2142 (char*)c->argv[0]->ptr));
2143 resetClient(c);
2144 return 1;
2145 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2146 (c->argc < -cmd->arity)) {
2147 addReplySds(c,
2148 sdscatprintf(sdsempty(),
2149 "-ERR wrong number of arguments for '%s' command\r\n",
2150 cmd->name));
2151 resetClient(c);
2152 return 1;
2153 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2154 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2155 resetClient(c);
2156 return 1;
2157 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2158 /* This is a bulk command, we have to read the last argument yet. */
2159 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2160
2161 decrRefCount(c->argv[c->argc-1]);
2162 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2163 c->argc--;
2164 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2165 resetClient(c);
2166 return 1;
2167 }
2168 c->argc--;
2169 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2170 /* It is possible that the bulk read is already in the
2171 * buffer. Check this condition and handle it accordingly.
2172 * This is just a fast path, alternative to call processInputBuffer().
2173 * It's a good idea since the code is small and this condition
2174 * happens most of the times. */
2175 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2176 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2177 c->argc++;
2178 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2179 } else {
2180 /* Otherwise return... there is to read the last argument
2181 * from the socket. */
2182 return 1;
2183 }
2184 }
2185 /* Let's try to share objects on the command arguments vector */
2186 if (server.shareobjects) {
2187 int j;
2188 for(j = 1; j < c->argc; j++)
2189 c->argv[j] = tryObjectSharing(c->argv[j]);
2190 }
2191 /* Let's try to encode the bulk object to save space. */
2192 if (cmd->flags & REDIS_CMD_BULK)
2193 tryObjectEncoding(c->argv[c->argc-1]);
2194
2195 /* Check if the user is authenticated */
2196 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2197 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2198 resetClient(c);
2199 return 1;
2200 }
2201
2202 /* Exec the command */
2203 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2204 queueMultiCommand(c,cmd);
2205 addReply(c,shared.queued);
2206 } else {
2207 if (server.vm_enabled && server.vm_max_threads > 0 &&
2208 blockClientOnSwappedKeys(cmd,c)) return 1;
2209 call(c,cmd);
2210 }
2211
2212 /* Prepare the client for the next command */
2213 resetClient(c);
2214 return 1;
2215 }
2216
2217 static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
2218 listNode *ln;
2219 listIter li;
2220 int outc = 0, j;
2221 robj **outv;
2222 /* (args*2)+1 is enough room for args, spaces, newlines */
2223 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2224
2225 if (argc <= REDIS_STATIC_ARGS) {
2226 outv = static_outv;
2227 } else {
2228 outv = zmalloc(sizeof(robj*)*(argc*2+1));
2229 }
2230
2231 for (j = 0; j < argc; j++) {
2232 if (j != 0) outv[outc++] = shared.space;
2233 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2234 robj *lenobj;
2235
2236 lenobj = createObject(REDIS_STRING,
2237 sdscatprintf(sdsempty(),"%lu\r\n",
2238 (unsigned long) stringObjectLen(argv[j])));
2239 lenobj->refcount = 0;
2240 outv[outc++] = lenobj;
2241 }
2242 outv[outc++] = argv[j];
2243 }
2244 outv[outc++] = shared.crlf;
2245
2246 /* Increment all the refcounts at start and decrement at end in order to
2247 * be sure to free objects if there is no slave in a replication state
2248 * able to be feed with commands */
2249 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2250 listRewind(slaves,&li);
2251 while((ln = listNext(&li))) {
2252 redisClient *slave = ln->value;
2253
2254 /* Don't feed slaves that are still waiting for BGSAVE to start */
2255 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2256
2257 /* Feed all the other slaves, MONITORs and so on */
2258 if (slave->slaveseldb != dictid) {
2259 robj *selectcmd;
2260
2261 switch(dictid) {
2262 case 0: selectcmd = shared.select0; break;
2263 case 1: selectcmd = shared.select1; break;
2264 case 2: selectcmd = shared.select2; break;
2265 case 3: selectcmd = shared.select3; break;
2266 case 4: selectcmd = shared.select4; break;
2267 case 5: selectcmd = shared.select5; break;
2268 case 6: selectcmd = shared.select6; break;
2269 case 7: selectcmd = shared.select7; break;
2270 case 8: selectcmd = shared.select8; break;
2271 case 9: selectcmd = shared.select9; break;
2272 default:
2273 selectcmd = createObject(REDIS_STRING,
2274 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2275 selectcmd->refcount = 0;
2276 break;
2277 }
2278 addReply(slave,selectcmd);
2279 slave->slaveseldb = dictid;
2280 }
2281 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2282 }
2283 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2284 if (outv != static_outv) zfree(outv);
2285 }
2286
2287 static void processInputBuffer(redisClient *c) {
2288 again:
2289 /* Before to process the input buffer, make sure the client is not
2290 * waitig for a blocking operation such as BLPOP. Note that the first
2291 * iteration the client is never blocked, otherwise the processInputBuffer
2292 * would not be called at all, but after the execution of the first commands
2293 * in the input buffer the client may be blocked, and the "goto again"
2294 * will try to reiterate. The following line will make it return asap. */
2295 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2296 if (c->bulklen == -1) {
2297 /* Read the first line of the query */
2298 char *p = strchr(c->querybuf,'\n');
2299 size_t querylen;
2300
2301 if (p) {
2302 sds query, *argv;
2303 int argc, j;
2304
2305 query = c->querybuf;
2306 c->querybuf = sdsempty();
2307 querylen = 1+(p-(query));
2308 if (sdslen(query) > querylen) {
2309 /* leave data after the first line of the query in the buffer */
2310 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2311 }
2312 *p = '\0'; /* remove "\n" */
2313 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2314 sdsupdatelen(query);
2315
2316 /* Now we can split the query in arguments */
2317 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2318 sdsfree(query);
2319
2320 if (c->argv) zfree(c->argv);
2321 c->argv = zmalloc(sizeof(robj*)*argc);
2322
2323 for (j = 0; j < argc; j++) {
2324 if (sdslen(argv[j])) {
2325 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2326 c->argc++;
2327 } else {
2328 sdsfree(argv[j]);
2329 }
2330 }
2331 zfree(argv);
2332 if (c->argc) {
2333 /* Execute the command. If the client is still valid
2334 * after processCommand() return and there is something
2335 * on the query buffer try to process the next command. */
2336 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2337 } else {
2338 /* Nothing to process, argc == 0. Just process the query
2339 * buffer if it's not empty or return to the caller */
2340 if (sdslen(c->querybuf)) goto again;
2341 }
2342 return;
2343 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2344 redisLog(REDIS_VERBOSE, "Client protocol error");
2345 freeClient(c);
2346 return;
2347 }
2348 } else {
2349 /* Bulk read handling. Note that if we are at this point
2350 the client already sent a command terminated with a newline,
2351 we are reading the bulk data that is actually the last
2352 argument of the command. */
2353 int qbl = sdslen(c->querybuf);
2354
2355 if (c->bulklen <= qbl) {
2356 /* Copy everything but the final CRLF as final argument */
2357 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2358 c->argc++;
2359 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2360 /* Process the command. If the client is still valid after
2361 * the processing and there is more data in the buffer
2362 * try to parse it. */
2363 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2364 return;
2365 }
2366 }
2367 }
2368
2369 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2370 redisClient *c = (redisClient*) privdata;
2371 char buf[REDIS_IOBUF_LEN];
2372 int nread;
2373 REDIS_NOTUSED(el);
2374 REDIS_NOTUSED(mask);
2375
2376 nread = read(fd, buf, REDIS_IOBUF_LEN);
2377 if (nread == -1) {
2378 if (errno == EAGAIN) {
2379 nread = 0;
2380 } else {
2381 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2382 freeClient(c);
2383 return;
2384 }
2385 } else if (nread == 0) {
2386 redisLog(REDIS_VERBOSE, "Client closed connection");
2387 freeClient(c);
2388 return;
2389 }
2390 if (nread) {
2391 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2392 c->lastinteraction = time(NULL);
2393 } else {
2394 return;
2395 }
2396 if (!(c->flags & REDIS_BLOCKED))
2397 processInputBuffer(c);
2398 }
2399
2400 static int selectDb(redisClient *c, int id) {
2401 if (id < 0 || id >= server.dbnum)
2402 return REDIS_ERR;
2403 c->db = &server.db[id];
2404 return REDIS_OK;
2405 }
2406
2407 static void *dupClientReplyValue(void *o) {
2408 incrRefCount((robj*)o);
2409 return o;
2410 }
2411
2412 static redisClient *createClient(int fd) {
2413 redisClient *c = zmalloc(sizeof(*c));
2414
2415 anetNonBlock(NULL,fd);
2416 anetTcpNoDelay(NULL,fd);
2417 if (!c) return NULL;
2418 selectDb(c,0);
2419 c->fd = fd;
2420 c->querybuf = sdsempty();
2421 c->argc = 0;
2422 c->argv = NULL;
2423 c->bulklen = -1;
2424 c->multibulk = 0;
2425 c->mbargc = 0;
2426 c->mbargv = NULL;
2427 c->sentlen = 0;
2428 c->flags = 0;
2429 c->lastinteraction = time(NULL);
2430 c->authenticated = 0;
2431 c->replstate = REDIS_REPL_NONE;
2432 c->reply = listCreate();
2433 listSetFreeMethod(c->reply,decrRefCount);
2434 listSetDupMethod(c->reply,dupClientReplyValue);
2435 c->blockingkeys = NULL;
2436 c->blockingkeysnum = 0;
2437 c->io_keys = listCreate();
2438 listSetFreeMethod(c->io_keys,decrRefCount);
2439 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2440 readQueryFromClient, c) == AE_ERR) {
2441 freeClient(c);
2442 return NULL;
2443 }
2444 listAddNodeTail(server.clients,c);
2445 initClientMultiState(c);
2446 return c;
2447 }
2448
2449 static void addReply(redisClient *c, robj *obj) {
2450 if (listLength(c->reply) == 0 &&
2451 (c->replstate == REDIS_REPL_NONE ||
2452 c->replstate == REDIS_REPL_ONLINE) &&
2453 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2454 sendReplyToClient, c) == AE_ERR) return;
2455
2456 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2457 obj = dupStringObject(obj);
2458 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2459 }
2460 listAddNodeTail(c->reply,getDecodedObject(obj));
2461 }
2462
2463 static void addReplySds(redisClient *c, sds s) {
2464 robj *o = createObject(REDIS_STRING,s);
2465 addReply(c,o);
2466 decrRefCount(o);
2467 }
2468
2469 static void addReplyDouble(redisClient *c, double d) {
2470 char buf[128];
2471
2472 snprintf(buf,sizeof(buf),"%.17g",d);
2473 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2474 (unsigned long) strlen(buf),buf));
2475 }
2476
2477 static void addReplyLong(redisClient *c, long l) {
2478 char buf[128];
2479 size_t len;
2480
2481 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2482 addReplySds(c,sdsnewlen(buf,len));
2483 }
2484
2485 static void addReplyUlong(redisClient *c, unsigned long ul) {
2486 char buf[128];
2487 size_t len;
2488
2489 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2490 addReplySds(c,sdsnewlen(buf,len));
2491 }
2492
2493 static void addReplyBulkLen(redisClient *c, robj *obj) {
2494 size_t len;
2495
2496 if (obj->encoding == REDIS_ENCODING_RAW) {
2497 len = sdslen(obj->ptr);
2498 } else {
2499 long n = (long)obj->ptr;
2500
2501 /* Compute how many bytes will take this integer as a radix 10 string */
2502 len = 1;
2503 if (n < 0) {
2504 len++;
2505 n = -n;
2506 }
2507 while((n = n/10) != 0) {
2508 len++;
2509 }
2510 }
2511 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2512 }
2513
2514 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2515 int cport, cfd;
2516 char cip[128];
2517 redisClient *c;
2518 REDIS_NOTUSED(el);
2519 REDIS_NOTUSED(mask);
2520 REDIS_NOTUSED(privdata);
2521
2522 cfd = anetAccept(server.neterr, fd, cip, &cport);
2523 if (cfd == AE_ERR) {
2524 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2525 return;
2526 }
2527 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2528 if ((c = createClient(cfd)) == NULL) {
2529 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2530 close(cfd); /* May be already closed, just ingore errors */
2531 return;
2532 }
2533 /* If maxclient directive is set and this is one client more... close the
2534 * connection. Note that we create the client instead to check before
2535 * for this condition, since now the socket is already set in nonblocking
2536 * mode and we can send an error for free using the Kernel I/O */
2537 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2538 char *err = "-ERR max number of clients reached\r\n";
2539
2540 /* That's a best effort error message, don't check write errors */
2541 if (write(c->fd,err,strlen(err)) == -1) {
2542 /* Nothing to do, Just to avoid the warning... */
2543 }
2544 freeClient(c);
2545 return;
2546 }
2547 server.stat_numconnections++;
2548 }
2549
2550 /* ======================= Redis objects implementation ===================== */
2551
2552 static robj *createObject(int type, void *ptr) {
2553 robj *o;
2554
2555 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2556 if (listLength(server.objfreelist)) {
2557 listNode *head = listFirst(server.objfreelist);
2558 o = listNodeValue(head);
2559 listDelNode(server.objfreelist,head);
2560 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2561 } else {
2562 if (server.vm_enabled) {
2563 pthread_mutex_unlock(&server.obj_freelist_mutex);
2564 o = zmalloc(sizeof(*o));
2565 } else {
2566 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2567 }
2568 }
2569 o->type = type;
2570 o->encoding = REDIS_ENCODING_RAW;
2571 o->ptr = ptr;
2572 o->refcount = 1;
2573 if (server.vm_enabled) {
2574 /* Note that this code may run in the context of an I/O thread
2575 * and accessing to server.unixtime in theory is an error
2576 * (no locks). But in practice this is safe, and even if we read
2577 * garbage Redis will not fail, as it's just a statistical info */
2578 o->vm.atime = server.unixtime;
2579 o->storage = REDIS_VM_MEMORY;
2580 }
2581 return o;
2582 }
2583
2584 static robj *createStringObject(char *ptr, size_t len) {
2585 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2586 }
2587
2588 static robj *dupStringObject(robj *o) {
2589 assert(o->encoding == REDIS_ENCODING_RAW);
2590 return createStringObject(o->ptr,sdslen(o->ptr));
2591 }
2592
2593 static robj *createListObject(void) {
2594 list *l = listCreate();
2595
2596 listSetFreeMethod(l,decrRefCount);
2597 return createObject(REDIS_LIST,l);
2598 }
2599
2600 static robj *createSetObject(void) {
2601 dict *d = dictCreate(&setDictType,NULL);
2602 return createObject(REDIS_SET,d);
2603 }
2604
2605 static robj *createHashObject(void) {
2606 /* All the Hashes start as zipmaps. Will be automatically converted
2607 * into hash tables if there are enough elements or big elements
2608 * inside. */
2609 unsigned char *zm = zipmapNew();
2610 robj *o = createObject(REDIS_HASH,zm);
2611 o->encoding = REDIS_ENCODING_ZIPMAP;
2612 return o;
2613 }
2614
2615 static robj *createZsetObject(void) {
2616 zset *zs = zmalloc(sizeof(*zs));
2617
2618 zs->dict = dictCreate(&zsetDictType,NULL);
2619 zs->zsl = zslCreate();
2620 return createObject(REDIS_ZSET,zs);
2621 }
2622
2623 static void freeStringObject(robj *o) {
2624 if (o->encoding == REDIS_ENCODING_RAW) {
2625 sdsfree(o->ptr);
2626 }
2627 }
2628
2629 static void freeListObject(robj *o) {
2630 listRelease((list*) o->ptr);
2631 }
2632
2633 static void freeSetObject(robj *o) {
2634 dictRelease((dict*) o->ptr);
2635 }
2636
2637 static void freeZsetObject(robj *o) {
2638 zset *zs = o->ptr;
2639
2640 dictRelease(zs->dict);
2641 zslFree(zs->zsl);
2642 zfree(zs);
2643 }
2644
2645 static void freeHashObject(robj *o) {
2646 switch (o->encoding) {
2647 case REDIS_ENCODING_HT:
2648 dictRelease((dict*) o->ptr);
2649 break;
2650 case REDIS_ENCODING_ZIPMAP:
2651 zfree(o->ptr);
2652 break;
2653 default:
2654 redisAssert(0);
2655 break;
2656 }
2657 }
2658
2659 static void incrRefCount(robj *o) {
2660 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2661 o->refcount++;
2662 }
2663
2664 static void decrRefCount(void *obj) {
2665 robj *o = obj;
2666
2667 /* Object is a key of a swapped out value, or in the process of being
2668 * loaded. */
2669 if (server.vm_enabled &&
2670 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2671 {
2672 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2673 redisAssert(o->refcount == 1);
2674 }
2675 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2676 redisAssert(o->type == REDIS_STRING);
2677 freeStringObject(o);
2678 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2679 pthread_mutex_lock(&server.obj_freelist_mutex);
2680 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2681 !listAddNodeHead(server.objfreelist,o))
2682 zfree(o);
2683 pthread_mutex_unlock(&server.obj_freelist_mutex);
2684 server.vm_stats_swapped_objects--;
2685 return;
2686 }
2687 /* Object is in memory, or in the process of being swapped out. */
2688 if (--(o->refcount) == 0) {
2689 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2690 vmCancelThreadedIOJob(obj);
2691 switch(o->type) {
2692 case REDIS_STRING: freeStringObject(o); break;
2693 case REDIS_LIST: freeListObject(o); break;
2694 case REDIS_SET: freeSetObject(o); break;
2695 case REDIS_ZSET: freeZsetObject(o); break;
2696 case REDIS_HASH: freeHashObject(o); break;
2697 default: redisAssert(0 != 0); break;
2698 }
2699 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2700 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2701 !listAddNodeHead(server.objfreelist,o))
2702 zfree(o);
2703 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2704 }
2705 }
2706
2707 static robj *lookupKey(redisDb *db, robj *key) {
2708 dictEntry *de = dictFind(db->dict,key);
2709 if (de) {
2710 robj *key = dictGetEntryKey(de);
2711 robj *val = dictGetEntryVal(de);
2712
2713 if (server.vm_enabled) {
2714 if (key->storage == REDIS_VM_MEMORY ||
2715 key->storage == REDIS_VM_SWAPPING)
2716 {
2717 /* If we were swapping the object out, stop it, this key
2718 * was requested. */
2719 if (key->storage == REDIS_VM_SWAPPING)
2720 vmCancelThreadedIOJob(key);
2721 /* Update the access time of the key for the aging algorithm. */
2722 key->vm.atime = server.unixtime;
2723 } else {
2724 int notify = (key->storage == REDIS_VM_LOADING);
2725
2726 /* Our value was swapped on disk. Bring it at home. */
2727 redisAssert(val == NULL);
2728 val = vmLoadObject(key);
2729 dictGetEntryVal(de) = val;
2730
2731 /* Clients blocked by the VM subsystem may be waiting for
2732 * this key... */
2733 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2734 }
2735 }
2736 return val;
2737 } else {
2738 return NULL;
2739 }
2740 }
2741
2742 static robj *lookupKeyRead(redisDb *db, robj *key) {
2743 expireIfNeeded(db,key);
2744 return lookupKey(db,key);
2745 }
2746
2747 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2748 deleteIfVolatile(db,key);
2749 return lookupKey(db,key);
2750 }
2751
2752 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2753 robj *o = lookupKeyRead(c->db, key);
2754 if (!o) addReply(c,reply);
2755 return o;
2756 }
2757
2758 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2759 robj *o = lookupKeyWrite(c->db, key);
2760 if (!o) addReply(c,reply);
2761 return o;
2762 }
2763
2764 static int checkType(redisClient *c, robj *o, int type) {
2765 if (o->type != type) {
2766 addReply(c,shared.wrongtypeerr);
2767 return 1;
2768 }
2769 return 0;
2770 }
2771
2772 static int deleteKey(redisDb *db, robj *key) {
2773 int retval;
2774
2775 /* We need to protect key from destruction: after the first dictDelete()
2776 * it may happen that 'key' is no longer valid if we don't increment
2777 * it's count. This may happen when we get the object reference directly
2778 * from the hash table with dictRandomKey() or dict iterators */
2779 incrRefCount(key);
2780 if (dictSize(db->expires)) dictDelete(db->expires,key);
2781 retval = dictDelete(db->dict,key);
2782 decrRefCount(key);
2783
2784 return retval == DICT_OK;
2785 }
2786
2787 /* Try to share an object against the shared objects pool */
2788 static robj *tryObjectSharing(robj *o) {
2789 struct dictEntry *de;
2790 unsigned long c;
2791
2792 if (o == NULL || server.shareobjects == 0) return o;
2793
2794 redisAssert(o->type == REDIS_STRING);
2795 de = dictFind(server.sharingpool,o);
2796 if (de) {
2797 robj *shared = dictGetEntryKey(de);
2798
2799 c = ((unsigned long) dictGetEntryVal(de))+1;
2800 dictGetEntryVal(de) = (void*) c;
2801 incrRefCount(shared);
2802 decrRefCount(o);
2803 return shared;
2804 } else {
2805 /* Here we are using a stream algorihtm: Every time an object is
2806 * shared we increment its count, everytime there is a miss we
2807 * recrement the counter of a random object. If this object reaches
2808 * zero we remove the object and put the current object instead. */
2809 if (dictSize(server.sharingpool) >=
2810 server.sharingpoolsize) {
2811 de = dictGetRandomKey(server.sharingpool);
2812 redisAssert(de != NULL);
2813 c = ((unsigned long) dictGetEntryVal(de))-1;
2814 dictGetEntryVal(de) = (void*) c;
2815 if (c == 0) {
2816 dictDelete(server.sharingpool,de->key);
2817 }
2818 } else {
2819 c = 0; /* If the pool is empty we want to add this object */
2820 }
2821 if (c == 0) {
2822 int retval;
2823
2824 retval = dictAdd(server.sharingpool,o,(void*)1);
2825 redisAssert(retval == DICT_OK);
2826 incrRefCount(o);
2827 }
2828 return o;
2829 }
2830 }
2831
2832 /* Check if the nul-terminated string 's' can be represented by a long
2833 * (that is, is a number that fits into long without any other space or
2834 * character before or after the digits).
2835 *
2836 * If so, the function returns REDIS_OK and *longval is set to the value
2837 * of the number. Otherwise REDIS_ERR is returned */
2838 static int isStringRepresentableAsLong(sds s, long *longval) {
2839 char buf[32], *endptr;
2840 long value;
2841 int slen;
2842
2843 value = strtol(s, &endptr, 10);
2844 if (endptr[0] != '\0') return REDIS_ERR;
2845 slen = snprintf(buf,32,"%ld",value);
2846
2847 /* If the number converted back into a string is not identical
2848 * then it's not possible to encode the string as integer */
2849 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2850 if (longval) *longval = value;
2851 return REDIS_OK;
2852 }
2853
2854 /* Try to encode a string object in order to save space */
2855 static int tryObjectEncoding(robj *o) {
2856 long value;
2857 sds s = o->ptr;
2858
2859 if (o->encoding != REDIS_ENCODING_RAW)
2860 return REDIS_ERR; /* Already encoded */
2861
2862 /* It's not save to encode shared objects: shared objects can be shared
2863 * everywhere in the "object space" of Redis. Encoded objects can only
2864 * appear as "values" (and not, for instance, as keys) */
2865 if (o->refcount > 1) return REDIS_ERR;
2866
2867 /* Currently we try to encode only strings */
2868 redisAssert(o->type == REDIS_STRING);
2869
2870 /* Check if we can represent this string as a long integer */
2871 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2872
2873 /* Ok, this object can be encoded */
2874 o->encoding = REDIS_ENCODING_INT;
2875 sdsfree(o->ptr);
2876 o->ptr = (void*) value;
2877 return REDIS_OK;
2878 }
2879
2880 /* Get a decoded version of an encoded object (returned as a new object).
2881 * If the object is already raw-encoded just increment the ref count. */
2882 static robj *getDecodedObject(robj *o) {
2883 robj *dec;
2884
2885 if (o->encoding == REDIS_ENCODING_RAW) {
2886 incrRefCount(o);
2887 return o;
2888 }
2889 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2890 char buf[32];
2891
2892 snprintf(buf,32,"%ld",(long)o->ptr);
2893 dec = createStringObject(buf,strlen(buf));
2894 return dec;
2895 } else {
2896 redisAssert(1 != 1);
2897 }
2898 }
2899
2900 /* Compare two string objects via strcmp() or alike.
2901 * Note that the objects may be integer-encoded. In such a case we
2902 * use snprintf() to get a string representation of the numbers on the stack
2903 * and compare the strings, it's much faster than calling getDecodedObject().
2904 *
2905 * Important note: if objects are not integer encoded, but binary-safe strings,
2906 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2907 * binary safe. */
2908 static int compareStringObjects(robj *a, robj *b) {
2909 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
2910 char bufa[128], bufb[128], *astr, *bstr;
2911 int bothsds = 1;
2912
2913 if (a == b) return 0;
2914 if (a->encoding != REDIS_ENCODING_RAW) {
2915 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2916 astr = bufa;
2917 bothsds = 0;
2918 } else {
2919 astr = a->ptr;
2920 }
2921 if (b->encoding != REDIS_ENCODING_RAW) {
2922 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2923 bstr = bufb;
2924 bothsds = 0;
2925 } else {
2926 bstr = b->ptr;
2927 }
2928 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
2929 }
2930
2931 static size_t stringObjectLen(robj *o) {
2932 redisAssert(o->type == REDIS_STRING);
2933 if (o->encoding == REDIS_ENCODING_RAW) {
2934 return sdslen(o->ptr);
2935 } else {
2936 char buf[32];
2937
2938 return snprintf(buf,32,"%ld",(long)o->ptr);
2939 }
2940 }
2941
2942 /*============================ RDB saving/loading =========================== */
2943
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
2948
/* Write 't' to 'fp' as a 4 byte signed integer in host byte order (the
 * value is truncated to 32 bits). Returns 0 on success, -1 on error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
2954
2955 /* check rdbLoadLen() comments for more info */
2956 static int rdbSaveLen(FILE *fp, uint32_t len) {
2957 unsigned char buf[2];
2958
2959 if (len < (1<<6)) {
2960 /* Save a 6 bit len */
2961 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
2962 if (fwrite(buf,1,1,fp) == 0) return -1;
2963 } else if (len < (1<<14)) {
2964 /* Save a 14 bit len */
2965 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
2966 buf[1] = len&0xFF;
2967 if (fwrite(buf,2,1,fp) == 0) return -1;
2968 } else {
2969 /* Save a 32 bit len */
2970 buf[0] = (REDIS_RDB_32BITLEN<<6);
2971 if (fwrite(buf,1,1,fp) == 0) return -1;
2972 len = htonl(len);
2973 if (fwrite(&len,4,1,fp) == 0) return -1;
2974 }
2975 return 0;
2976 }
2977
2978 /* String objects in the form "2391" "-100" without any space and with a
2979 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2980 * encoded as integers to save space */
2981 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
2982 long long value;
2983 char *endptr, buf[32];
2984
2985 /* Check if it's possible to encode this value as a number */
2986 value = strtoll(s, &endptr, 10);
2987 if (endptr[0] != '\0') return 0;
2988 snprintf(buf,32,"%lld",value);
2989
2990 /* If the number converted back into a string is not identical
2991 * then it's not possible to encode the string as integer */
2992 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
2993
2994 /* Finally check if it fits in our ranges */
2995 if (value >= -(1<<7) && value <= (1<<7)-1) {
2996 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2997 enc[1] = value&0xFF;
2998 return 2;
2999 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3000 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3001 enc[1] = value&0xFF;
3002 enc[2] = (value>>8)&0xFF;
3003 return 3;
3004 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3005 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3006 enc[1] = value&0xFF;
3007 enc[2] = (value>>8)&0xFF;
3008 enc[3] = (value>>16)&0xFF;
3009 enc[4] = (value>>24)&0xFF;
3010 return 5;
3011 } else {
3012 return 0;
3013 }
3014 }
3015
3016 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3017 size_t comprlen, outlen;
3018 unsigned char byte;
3019 void *out;
3020
3021 /* We require at least four bytes compression for this to be worth it */
3022 if (len <= 4) return 0;
3023 outlen = len-4;
3024 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3025 comprlen = lzf_compress(s, len, out, outlen);
3026 if (comprlen == 0) {
3027 zfree(out);
3028 return 0;
3029 }
3030 /* Data compressed! Let's save it on disk */
3031 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3032 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3033 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3034 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3035 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3036 zfree(out);
3037 return comprlen;
3038
3039 writeerr:
3040 zfree(out);
3041 return -1;
3042 }
3043
3044 /* Save a string objet as [len][data] on disk. If the object is a string
3045 * representation of an integer value we try to safe it in a special form */
3046 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3047 int enclen;
3048
3049 /* Try integer encoding */
3050 if (len <= 11) {
3051 unsigned char buf[5];
3052 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3053 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3054 return 0;
3055 }
3056 }
3057
3058 /* Try LZF compression - under 20 bytes it's unable to compress even
3059 * aaaaaaaaaaaaaaaaaa so skip it */
3060 if (server.rdbcompression && len > 20) {
3061 int retval;
3062
3063 retval = rdbSaveLzfStringObject(fp,s,len);
3064 if (retval == -1) return -1;
3065 if (retval > 0) return 0;
3066 /* retval == 0 means data can't be compressed, save the old way */
3067 }
3068
3069 /* Store verbatim */
3070 if (rdbSaveLen(fp,len) == -1) return -1;
3071 if (len && fwrite(s,len,1,fp) == 0) return -1;
3072 return 0;
3073 }
3074
3075 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3076 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3077 int retval;
3078
3079 /* Avoid incr/decr ref count business when possible.
3080 * This plays well with copy-on-write given that we are probably
3081 * in a child process (BGSAVE). Also this makes sure key objects
3082 * of swapped objects are not incRefCount-ed (an assert does not allow
3083 * this in order to avoid bugs) */
3084 if (obj->encoding != REDIS_ENCODING_RAW) {
3085 obj = getDecodedObject(obj);
3086 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3087 decrRefCount(obj);
3088 } else {
3089 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3090 }
3091 return retval;
3092 }
3093
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation ("%.17g").
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)out+1;

        snprintf(repr,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(repr);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3120
3121 /* Save a Redis object. */
3122 static int rdbSaveObject(FILE *fp, robj *o) {
3123 if (o->type == REDIS_STRING) {
3124 /* Save a string value */
3125 if (rdbSaveStringObject(fp,o) == -1) return -1;
3126 } else if (o->type == REDIS_LIST) {
3127 /* Save a list value */
3128 list *list = o->ptr;
3129 listIter li;
3130 listNode *ln;
3131
3132 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3133 listRewind(list,&li);
3134 while((ln = listNext(&li))) {
3135 robj *eleobj = listNodeValue(ln);
3136
3137 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3138 }
3139 } else if (o->type == REDIS_SET) {
3140 /* Save a set value */
3141 dict *set = o->ptr;
3142 dictIterator *di = dictGetIterator(set);
3143 dictEntry *de;
3144
3145 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3146 while((de = dictNext(di)) != NULL) {
3147 robj *eleobj = dictGetEntryKey(de);
3148
3149 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3150 }
3151 dictReleaseIterator(di);
3152 } else if (o->type == REDIS_ZSET) {
3153 /* Save a set value */
3154 zset *zs = o->ptr;
3155 dictIterator *di = dictGetIterator(zs->dict);
3156 dictEntry *de;
3157
3158 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3159 while((de = dictNext(di)) != NULL) {
3160 robj *eleobj = dictGetEntryKey(de);
3161 double *score = dictGetEntryVal(de);
3162
3163 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3164 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3165 }
3166 dictReleaseIterator(di);
3167 } else if (o->type == REDIS_HASH) {
3168 /* Save a hash value */
3169 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3170 unsigned char *p = zipmapRewind(o->ptr);
3171 unsigned int count = zipmapLen(o->ptr);
3172 unsigned char *key, *val;
3173 unsigned int klen, vlen;
3174
3175 if (rdbSaveLen(fp,count) == -1) return -1;
3176 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3177 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3178 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3179 }
3180 } else {
3181 dictIterator *di = dictGetIterator(o->ptr);
3182 dictEntry *de;
3183
3184 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3185 while((de = dictNext(di)) != NULL) {
3186 robj *key = dictGetEntryKey(de);
3187 robj *val = dictGetEntryVal(de);
3188
3189 if (rdbSaveStringObject(fp,key) == -1) return -1;
3190 if (rdbSaveStringObject(fp,val) == -1) return -1;
3191 }
3192 dictReleaseIterator(di);
3193 }
3194 } else {
3195 redisAssert(0 != 0);
3196 }
3197 return 0;
3198 }
3199
3200 /* Return the length the object will have on disk if saved with
3201 * the rdbSaveObject() function. Currently we use a trick to get
3202 * this length with very little changes to the code. In the future
3203 * we could switch to a faster solution. */
3204 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3205 if (fp == NULL) fp = server.devnull;
3206 rewind(fp);
3207 assert(rdbSaveObject(fp,o) != 1);
3208 return ftello(fp);
3209 }
3210
3211 /* Return the number of pages required to save this object in the swap file */
3212 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3213 off_t bytes = rdbSavedObjectLen(o,fp);
3214
3215 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3216 }
3217
3218 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3219 static int rdbSave(char *filename) {
3220 dictIterator *di = NULL;
3221 dictEntry *de;
3222 FILE *fp;
3223 char tmpfile[256];
3224 int j;
3225 time_t now = time(NULL);
3226
3227 /* Wait for I/O therads to terminate, just in case this is a
3228 * foreground-saving, to avoid seeking the swap file descriptor at the
3229 * same time. */
3230 if (server.vm_enabled)
3231 waitEmptyIOJobsQueue();
3232
3233 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3234 fp = fopen(tmpfile,"w");
3235 if (!fp) {
3236 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3237 return REDIS_ERR;
3238 }
3239 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3240 for (j = 0; j < server.dbnum; j++) {
3241 redisDb *db = server.db+j;
3242 dict *d = db->dict;
3243 if (dictSize(d) == 0) continue;
3244 di = dictGetIterator(d);
3245 if (!di) {
3246 fclose(fp);
3247 return REDIS_ERR;
3248 }
3249
3250 /* Write the SELECT DB opcode */
3251 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3252 if (rdbSaveLen(fp,j) == -1) goto werr;
3253
3254 /* Iterate this DB writing every entry */
3255 while((de = dictNext(di)) != NULL) {
3256 robj *key = dictGetEntryKey(de);
3257 robj *o = dictGetEntryVal(de);
3258 time_t expiretime = getExpire(db,key);
3259
3260 /* Save the expire time */
3261 if (expiretime != -1) {
3262 /* If this key is already expired skip it */
3263 if (expiretime < now) continue;
3264 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3265 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3266 }
3267 /* Save the key and associated value. This requires special
3268 * handling if the value is swapped out. */
3269 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3270 key->storage == REDIS_VM_SWAPPING) {
3271 /* Save type, key, value */
3272 if (rdbSaveType(fp,o->type) == -1) goto werr;
3273 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3274 if (rdbSaveObject(fp,o) == -1) goto werr;
3275 } else {
3276 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3277 robj *po;
3278 /* Get a preview of the object in memory */
3279 po = vmPreviewObject(key);
3280 /* Save type, key, value */
3281 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3282 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3283 if (rdbSaveObject(fp,po) == -1) goto werr;
3284 /* Remove the loaded object from memory */
3285 decrRefCount(po);
3286 }
3287 }
3288 dictReleaseIterator(di);
3289 }
3290 /* EOF opcode */
3291 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3292
3293 /* Make sure data will not remain on the OS's output buffers */
3294 fflush(fp);
3295 fsync(fileno(fp));
3296 fclose(fp);
3297
3298 /* Use RENAME to make sure the DB file is changed atomically only
3299 * if the generate DB file is ok. */
3300 if (rename(tmpfile,filename) == -1) {
3301 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3302 unlink(tmpfile);
3303 return REDIS_ERR;
3304 }
3305 redisLog(REDIS_NOTICE,"DB saved on disk");
3306 server.dirty = 0;
3307 server.lastsave = time(NULL);
3308 return REDIS_OK;
3309
3310 werr:
3311 fclose(fp);
3312 unlink(tmpfile);
3313 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3314 if (di) dictReleaseIterator(di);
3315 return REDIS_ERR;
3316 }
3317
3318 static int rdbSaveBackground(char *filename) {
3319 pid_t childpid;
3320
3321 if (server.bgsavechildpid != -1) return REDIS_ERR;
3322 if (server.vm_enabled) waitEmptyIOJobsQueue();
3323 if ((childpid = fork()) == 0) {
3324 /* Child */
3325 if (server.vm_enabled) vmReopenSwapFile();
3326 close(server.fd);
3327 if (rdbSave(filename) == REDIS_OK) {
3328 _exit(0);
3329 } else {
3330 _exit(1);
3331 }
3332 } else {
3333 /* Parent */
3334 if (childpid == -1) {
3335 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3336 strerror(errno));
3337 return REDIS_ERR;
3338 }
3339 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3340 server.bgsavechildpid = childpid;
3341 return REDIS_OK;
3342 }
3343 return REDIS_OK; /* unreached */
3344 }
3345
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the process
 * 'childpid'. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3352
/* Read a one byte type from 'fp'. Returns the byte as a non negative
 * integer, or -1 if it can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3358
/* Read a time saved by rdbSaveTime() (4 byte signed integer in host byte
 * order). Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) == 0) return -1;
    return (time_t) stamp;
}
3364
3365 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3366 * of this file for a description of how this are stored on disk.
3367 *
3368 * isencoded is set to 1 if the readed length is not actually a length but
3369 * an "encoding type", check the above comments for more info */
3370 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3371 unsigned char buf[2];
3372 uint32_t len;
3373 int type;
3374
3375 if (isencoded) *isencoded = 0;
3376 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3377 type = (buf[0]&0xC0)>>6;
3378 if (type == REDIS_RDB_6BITLEN) {
3379 /* Read a 6 bit len */
3380 return buf[0]&0x3F;
3381 } else if (type == REDIS_RDB_ENCVAL) {
3382 /* Read a 6 bit len encoding type */
3383 if (isencoded) *isencoded = 1;
3384 return buf[0]&0x3F;
3385 } else if (type == REDIS_RDB_14BITLEN) {
3386 /* Read a 14 bit len */
3387 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3388 return ((buf[0]&0x3F)<<8)|buf[1];
3389 } else {
3390 /* Read a 32 bit len */
3391 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3392 return ntohl(len);
3393 }
3394 }
3395
3396 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3397 unsigned char enc[4];
3398 long long val;
3399
3400 if (enctype == REDIS_RDB_ENC_INT8) {
3401 if (fread(enc,1,1,fp) == 0) return NULL;
3402 val = (signed char)enc[0];
3403 } else if (enctype == REDIS_RDB_ENC_INT16) {
3404 uint16_t v;
3405 if (fread(enc,2,1,fp) == 0) return NULL;
3406 v = enc[0]|(enc[1]<<8);
3407 val = (int16_t)v;
3408 } else if (enctype == REDIS_RDB_ENC_INT32) {
3409 uint32_t v;
3410 if (fread(enc,4,1,fp) == 0) return NULL;
3411 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3412 val = (int32_t)v;
3413 } else {
3414 val = 0; /* anti-warning */
3415 redisAssert(0!=0);
3416 }
3417 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3418 }
3419
3420 static robj *rdbLoadLzfStringObject(FILE*fp) {
3421 unsigned int len, clen;
3422 unsigned char *c = NULL;
3423 sds val = NULL;
3424
3425 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3426 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3427 if ((c = zmalloc(clen)) == NULL) goto err;
3428 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3429 if (fread(c,clen,1,fp) == 0) goto err;
3430 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3431 zfree(c);
3432 return createObject(REDIS_STRING,val);
3433 err:
3434 zfree(c);
3435 sdsfree(val);
3436 return NULL;
3437 }
3438
3439 static robj *rdbLoadStringObject(FILE*fp) {
3440 int isencoded;
3441 uint32_t len;
3442 sds val;
3443
3444 len = rdbLoadLen(fp,&isencoded);
3445 if (isencoded) {
3446 switch(len) {
3447 case REDIS_RDB_ENC_INT8:
3448 case REDIS_RDB_ENC_INT16:
3449 case REDIS_RDB_ENC_INT32:
3450 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3451 case REDIS_RDB_ENC_LZF:
3452 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3453 default:
3454 redisAssert(0!=0);
3455 }
3456 }
3457
3458 if (len == REDIS_RDB_LENERR) return NULL;
3459 val = sdsnewlen(NULL,len);
3460 if (len && fread(val,len,1,fp) == 0) {
3461 sdsfree(val);
3462 return NULL;
3463 }
3464 return tryObjectSharing(createObject(REDIS_STRING,val));
3465 }
3466
3467 /* For information about double serialization check rdbSaveDoubleValue() */
3468 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3469 char buf[128];
3470 unsigned char len;
3471
3472 if (fread(&len,1,1,fp) == 0) return -1;
3473 switch(len) {
3474 case 255: *val = R_NegInf; return 0;
3475 case 254: *val = R_PosInf; return 0;
3476 case 253: *val = R_Nan; return 0;
3477 default:
3478 if (fread(buf,len,1,fp) == 0) return -1;
3479 buf[len] = '\0';
3480 sscanf(buf, "%lg", val);
3481 return 0;
3482 }
3483 }
3484
3485 /* Load a Redis object of the specified type from the specified file.
3486 * On success a newly allocated object is returned, otherwise NULL. */
3487 static robj *rdbLoadObject(int type, FILE *fp) {
3488 robj *o;
3489
3490 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3491 if (type == REDIS_STRING) {
3492 /* Read string value */
3493 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3494 tryObjectEncoding(o);
3495 } else if (type == REDIS_LIST || type == REDIS_SET) {
3496 /* Read list/set value */
3497 uint32_t listlen;
3498
3499 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3500 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3501 /* It's faster to expand the dict to the right size asap in order
3502 * to avoid rehashing */
3503 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3504 dictExpand(o->ptr,listlen);
3505 /* Load every single element of the list/set */
3506 while(listlen--) {
3507 robj *ele;
3508
3509 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3510 tryObjectEncoding(ele);
3511 if (type == REDIS_LIST) {
3512 listAddNodeTail((list*)o->ptr,ele);
3513 } else {
3514 dictAdd((dict*)o->ptr,ele,NULL);
3515 }
3516 }
3517 } else if (type == REDIS_ZSET) {
3518 /* Read list/set value */
3519 size_t zsetlen;
3520 zset *zs;
3521
3522 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3523 o = createZsetObject();
3524 zs = o->ptr;
3525 /* Load every single element of the list/set */
3526 while(zsetlen--) {
3527 robj *ele;
3528 double *score = zmalloc(sizeof(double));
3529
3530 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3531 tryObjectEncoding(ele);
3532 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3533 dictAdd(zs->dict,ele,score);
3534 zslInsert(zs->zsl,*score,ele);
3535 incrRefCount(ele); /* added to skiplist */
3536 }
3537 } else if (type == REDIS_HASH) {
3538 size_t hashlen;
3539
3540 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3541 o = createHashObject();
3542 /* Too many entries? Use an hash table. */
3543 if (hashlen > server.hash_max_zipmap_entries)
3544 convertToRealHash(o);
3545 /* Load every key/value, then set it into the zipmap or hash
3546 * table, as needed. */
3547 while(hashlen--) {
3548 robj *key, *val;
3549
3550 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3551 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3552 /* If we are using a zipmap and there are too big values
3553 * the object is converted to real hash table encoding. */
3554 if (o->encoding != REDIS_ENCODING_HT &&
3555 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3556 sdslen(val->ptr) > server.hash_max_zipmap_value))
3557 {
3558 convertToRealHash(o);
3559 }
3560
3561 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3562 unsigned char *zm = o->ptr;
3563
3564 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3565 val->ptr,sdslen(val->ptr),NULL);
3566 o->ptr = zm;
3567 decrRefCount(key);
3568 decrRefCount(val);
3569 } else {
3570 tryObjectEncoding(key);
3571 tryObjectEncoding(val);
3572 dictAdd((dict*)o->ptr,key,val);
3573 incrRefCount(key);
3574 incrRefCount(val);
3575 }
3576 }
3577 } else {
3578 redisAssert(0 != 0);
3579 }
3580 return o;
3581 }
3582
3583 static int rdbLoad(char *filename) {
3584 FILE *fp;
3585 robj *keyobj = NULL;
3586 uint32_t dbid;
3587 int type, retval, rdbver;
3588 dict *d = server.db[0].dict;
3589 redisDb *db = server.db+0;
3590 char buf[1024];
3591 time_t expiretime = -1, now = time(NULL);
3592 long long loadedkeys = 0;
3593
3594 fp = fopen(filename,"r");
3595 if (!fp) return REDIS_ERR;
3596 if (fread(buf,9,1,fp) == 0) goto eoferr;
3597 buf[9] = '\0';
3598 if (memcmp(buf,"REDIS",5) != 0) {
3599 fclose(fp);
3600 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3601 return REDIS_ERR;
3602 }
3603 rdbver = atoi(buf+5);
3604 if (rdbver != 1) {
3605 fclose(fp);
3606 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3607 return REDIS_ERR;
3608 }
3609 while(1) {
3610 robj *o;
3611
3612 /* Read type. */
3613 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3614 if (type == REDIS_EXPIRETIME) {
3615 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3616 /* We read the time so we need to read the object type again */
3617 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3618 }
3619 if (type == REDIS_EOF) break;
3620 /* Handle SELECT DB opcode as a special case */
3621 if (type == REDIS_SELECTDB) {
3622 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3623 goto eoferr;
3624 if (dbid >= (unsigned)server.dbnum) {
3625 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3626 exit(1);
3627 }
3628 db = server.db+dbid;
3629 d = db->dict;
3630 continue;
3631 }
3632 /* Read key */
3633 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3634 /* Read value */
3635 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3636 /* Add the new object in the hash table */
3637 retval = dictAdd(d,keyobj,o);
3638 if (retval == DICT_ERR) {
3639 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3640 exit(1);
3641 }
3642 /* Set the expire time if needed */
3643 if (expiretime != -1) {
3644 setExpire(db,keyobj,expiretime);
3645 /* Delete this key if already expired */
3646 if (expiretime < now) deleteKey(db,keyobj);
3647 expiretime = -1;
3648 }
3649 keyobj = o = NULL;
3650 /* Handle swapping while loading big datasets when VM is on */
3651 loadedkeys++;
3652 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3653 while (zmalloc_used_memory() > server.vm_max_memory) {
3654 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3655 }
3656 }
3657 }
3658 fclose(fp);
3659 return REDIS_OK;
3660
3661 eoferr: /* unexpected end of file is handled here with a fatal exit */
3662 if (keyobj) decrRefCount(keyobj);
3663 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3664 exit(1);
3665 return REDIS_ERR; /* Just to avoid warning */
3666 }
3667
3668 /*================================== Commands =============================== */
3669
3670 static void authCommand(redisClient *c) {
3671 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3672 c->authenticated = 1;
3673 addReply(c,shared.ok);
3674 } else {
3675 c->authenticated = 0;
3676 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3677 }
3678 }
3679
3680 static void pingCommand(redisClient *c) {
3681 addReply(c,shared.pong);
3682 }
3683
3684 static void echoCommand(redisClient *c) {
3685 addReplyBulkLen(c,c->argv[1]);
3686 addReply(c,c->argv[1]);
3687 addReply(c,shared.crlf);
3688 }
3689
3690 /*=================================== Strings =============================== */
3691
3692 static void setGenericCommand(redisClient *c, int nx) {
3693 int retval;
3694
3695 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3696 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3697 if (retval == DICT_ERR) {
3698 if (!nx) {
3699 /* If the key is about a swapped value, we want a new key object
3700 * to overwrite the old. So we delete the old key in the database.
3701 * This will also make sure that swap pages about the old object
3702 * will be marked as free. */
3703 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3704 incrRefCount(c->argv[1]);
3705 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3706 incrRefCount(c->argv[2]);
3707 } else {
3708 addReply(c,shared.czero);
3709 return;
3710 }
3711 } else {
3712 incrRefCount(c->argv[1]);
3713 incrRefCount(c->argv[2]);
3714 }
3715 server.dirty++;
3716 removeExpire(c->db,c->argv[1]);
3717 addReply(c, nx ? shared.cone : shared.ok);
3718 }
3719
3720 static void setCommand(redisClient *c) {
3721 setGenericCommand(c,0);
3722 }
3723
3724 static void setnxCommand(redisClient *c) {
3725 setGenericCommand(c,1);
3726 }
3727
3728 static int getGenericCommand(redisClient *c) {
3729 robj *o = lookupKeyRead(c->db,c->argv[1]);
3730
3731 if (o == NULL) {
3732 addReply(c,shared.nullbulk);
3733 return REDIS_OK;
3734 } else {
3735 if (o->type != REDIS_STRING) {
3736 addReply(c,shared.wrongtypeerr);
3737 return REDIS_ERR;
3738 } else {
3739 addReplyBulkLen(c,o);
3740 addReply(c,o);
3741 addReply(c,shared.crlf);
3742 return REDIS_OK;
3743 }
3744 }
3745 }
3746
3747 static void getCommand(redisClient *c) {
3748 getGenericCommand(c);
3749 }
3750
3751 static void getsetCommand(redisClient *c) {
3752 if (getGenericCommand(c) == REDIS_ERR) return;
3753 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3754 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3755 } else {
3756 incrRefCount(c->argv[1]);
3757 }
3758 incrRefCount(c->argv[2]);
3759 server.dirty++;
3760 removeExpire(c->db,c->argv[1]);
3761 }
3762
3763 static void mgetCommand(redisClient *c) {
3764 int j;
3765
3766 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3767 for (j = 1; j < c->argc; j++) {
3768 robj *o = lookupKeyRead(c->db,c->argv[j]);
3769 if (o == NULL) {
3770 addReply(c,shared.nullbulk);
3771 } else {
3772 if (o->type != REDIS_STRING) {
3773 addReply(c,shared.nullbulk);
3774 } else {
3775 addReplyBulkLen(c,o);
3776 addReply(c,o);
3777 addReply(c,shared.crlf);
3778 }
3779 }
3780 }
3781 }
3782
3783 static void msetGenericCommand(redisClient *c, int nx) {
3784 int j, busykeys = 0;
3785
3786 if ((c->argc % 2) == 0) {
3787 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3788 return;
3789 }
3790 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3791 * set nothing at all if at least one already key exists. */
3792 if (nx) {
3793 for (j = 1; j < c->argc; j += 2) {
3794 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3795 busykeys++;
3796 }
3797 }
3798 }
3799 if (busykeys) {
3800 addReply(c, shared.czero);
3801 return;
3802 }
3803
3804 for (j = 1; j < c->argc; j += 2) {
3805 int retval;
3806
3807 tryObjectEncoding(c->argv[j+1]);
3808 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3809 if (retval == DICT_ERR) {
3810 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3811 incrRefCount(c->argv[j+1]);
3812 } else {
3813 incrRefCount(c->argv[j]);
3814 incrRefCount(c->argv[j+1]);
3815 }
3816 removeExpire(c->db,c->argv[j]);
3817 }
3818 server.dirty += (c->argc-1)/2;
3819 addReply(c, nx ? shared.cone : shared.ok);
3820 }
3821
3822 static void msetCommand(redisClient *c) {
3823 msetGenericCommand(c,0);
3824 }
3825
3826 static void msetnxCommand(redisClient *c) {
3827 msetGenericCommand(c,1);
3828 }
3829
3830 static void incrDecrCommand(redisClient *c, long long incr) {
3831 long long value;
3832 int retval;
3833 robj *o;
3834
3835 o = lookupKeyWrite(c->db,c->argv[1]);
3836 if (o == NULL) {
3837 value = 0;
3838 } else {
3839 if (o->type != REDIS_STRING) {
3840 value = 0;
3841 } else {
3842 char *eptr;
3843
3844 if (o->encoding == REDIS_ENCODING_RAW)
3845 value = strtoll(o->ptr, &eptr, 10);
3846 else if (o->encoding == REDIS_ENCODING_INT)
3847 value = (long)o->ptr;
3848 else
3849 redisAssert(1 != 1);
3850 }
3851 }
3852
3853 value += incr;
3854 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3855 tryObjectEncoding(o);
3856 retval = dictAdd(c->db->dict,c->argv[1],o);
3857 if (retval == DICT_ERR) {
3858 dictReplace(c->db->dict,c->argv[1],o);
3859 removeExpire(c->db,c->argv[1]);
3860 } else {
3861 incrRefCount(c->argv[1]);
3862 }
3863 server.dirty++;
3864 addReply(c,shared.colon);
3865 addReply(c,o);
3866 addReply(c,shared.crlf);
3867 }
3868
3869 static void incrCommand(redisClient *c) {
3870 incrDecrCommand(c,1);
3871 }
3872
3873 static void decrCommand(redisClient *c) {
3874 incrDecrCommand(c,-1);
3875 }
3876
3877 static void incrbyCommand(redisClient *c) {
3878 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3879 incrDecrCommand(c,incr);
3880 }
3881
3882 static void decrbyCommand(redisClient *c) {
3883 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3884 incrDecrCommand(c,-incr);
3885 }
3886
3887 static void appendCommand(redisClient *c) {
3888 int retval;
3889 size_t totlen;
3890 robj *o;
3891
3892 o = lookupKeyWrite(c->db,c->argv[1]);
3893 if (o == NULL) {
3894 /* Create the key */
3895 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3896 incrRefCount(c->argv[1]);
3897 incrRefCount(c->argv[2]);
3898 totlen = stringObjectLen(c->argv[2]);
3899 } else {
3900 dictEntry *de;
3901
3902 de = dictFind(c->db->dict,c->argv[1]);
3903 assert(de != NULL);
3904
3905 o = dictGetEntryVal(de);
3906 if (o->type != REDIS_STRING) {
3907 addReply(c,shared.wrongtypeerr);
3908 return;
3909 }
3910 /* If the object is specially encoded or shared we have to make
3911 * a copy */
3912 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
3913 robj *decoded = getDecodedObject(o);
3914
3915 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
3916 decrRefCount(decoded);
3917 dictReplace(c->db->dict,c->argv[1],o);
3918 }
3919 /* APPEND! */
3920 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
3921 o->ptr = sdscatlen(o->ptr,
3922 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
3923 } else {
3924 o->ptr = sdscatprintf(o->ptr, "%ld",
3925 (unsigned long) c->argv[2]->ptr);
3926 }
3927 totlen = sdslen(o->ptr);
3928 }
3929 server.dirty++;
3930 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
3931 }
3932
3933 static void substrCommand(redisClient *c) {
3934 robj *o;
3935 long start = atoi(c->argv[2]->ptr);
3936 long end = atoi(c->argv[3]->ptr);
3937
3938 o = lookupKeyRead(c->db,c->argv[1]);
3939 if (o == NULL) {
3940 addReply(c,shared.nullbulk);
3941 } else {
3942 if (o->type != REDIS_STRING) {
3943 addReply(c,shared.wrongtypeerr);
3944 } else {
3945 size_t rangelen, strlen;
3946 sds range;
3947
3948 o = getDecodedObject(o);
3949 strlen = sdslen(o->ptr);
3950
3951 /* convert negative indexes */
3952 if (start < 0) start = strlen+start;
3953 if (end < 0) end = strlen+end;
3954 if (start < 0) start = 0;
3955 if (end < 0) end = 0;
3956
3957 /* indexes sanity checks */
3958 if (start > end || (size_t)start >= strlen) {
3959 /* Out of range start or start > end result in null reply */
3960 addReply(c,shared.nullbulk);
3961 decrRefCount(o);
3962 return;
3963 }
3964 if ((size_t)end >= strlen) end = strlen-1;
3965 rangelen = (end-start)+1;
3966
3967 /* Return the result */
3968 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
3969 range = sdsnewlen((char*)o->ptr+start,rangelen);
3970 addReplySds(c,range);
3971 addReply(c,shared.crlf);
3972 decrRefCount(o);
3973 }
3974 }
3975 }
3976
3977 /* ========================= Type agnostic commands ========================= */
3978
3979 static void delCommand(redisClient *c) {
3980 int deleted = 0, j;
3981
3982 for (j = 1; j < c->argc; j++) {
3983 if (deleteKey(c->db,c->argv[j])) {
3984 server.dirty++;
3985 deleted++;
3986 }
3987 }
3988 switch(deleted) {
3989 case 0:
3990 addReply(c,shared.czero);
3991 break;
3992 case 1:
3993 addReply(c,shared.cone);
3994 break;
3995 default:
3996 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3997 break;
3998 }
3999 }
4000
4001 static void existsCommand(redisClient *c) {
4002 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4003 }
4004
4005 static void selectCommand(redisClient *c) {
4006 int id = atoi(c->argv[1]->ptr);
4007
4008 if (selectDb(c,id) == REDIS_ERR) {
4009 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4010 } else {
4011 addReply(c,shared.ok);
4012 }
4013 }
4014
4015 static void randomkeyCommand(redisClient *c) {
4016 dictEntry *de;
4017
4018 while(1) {
4019 de = dictGetRandomKey(c->db->dict);
4020 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4021 }
4022 if (de == NULL) {
4023 addReply(c,shared.plus);
4024 addReply(c,shared.crlf);
4025 } else {
4026 addReply(c,shared.plus);
4027 addReply(c,dictGetEntryKey(de));
4028 addReply(c,shared.crlf);
4029 }
4030 }
4031
4032 static void keysCommand(redisClient *c) {
4033 dictIterator *di;
4034 dictEntry *de;
4035 sds pattern = c->argv[1]->ptr;
4036 int plen = sdslen(pattern);
4037 unsigned long numkeys = 0;
4038 robj *lenobj = createObject(REDIS_STRING,NULL);
4039
4040 di = dictGetIterator(c->db->dict);
4041 addReply(c,lenobj);
4042 decrRefCount(lenobj);
4043 while((de = dictNext(di)) != NULL) {
4044 robj *keyobj = dictGetEntryKey(de);
4045
4046 sds key = keyobj->ptr;
4047 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4048 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4049 if (expireIfNeeded(c->db,keyobj) == 0) {
4050 addReplyBulkLen(c,keyobj);
4051 addReply(c,keyobj);
4052 addReply(c,shared.crlf);
4053 numkeys++;
4054 }
4055 }
4056 }
4057 dictReleaseIterator(di);
4058 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4059 }
4060
4061 static void dbsizeCommand(redisClient *c) {
4062 addReplySds(c,
4063 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4064 }
4065
4066 static void lastsaveCommand(redisClient *c) {
4067 addReplySds(c,
4068 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4069 }
4070
4071 static void typeCommand(redisClient *c) {
4072 robj *o;
4073 char *type;
4074
4075 o = lookupKeyRead(c->db,c->argv[1]);
4076 if (o == NULL) {
4077 type = "+none";
4078 } else {
4079 switch(o->type) {
4080 case REDIS_STRING: type = "+string"; break;
4081 case REDIS_LIST: type = "+list"; break;
4082 case REDIS_SET: type = "+set"; break;
4083 case REDIS_ZSET: type = "+zset"; break;
4084 case REDIS_HASH: type = "+hash"; break;
4085 default: type = "+unknown"; break;
4086 }
4087 }
4088 addReplySds(c,sdsnew(type));
4089 addReply(c,shared.crlf);
4090 }
4091
4092 static void saveCommand(redisClient *c) {
4093 if (server.bgsavechildpid != -1) {
4094 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4095 return;
4096 }
4097 if (rdbSave(server.dbfilename) == REDIS_OK) {
4098 addReply(c,shared.ok);
4099 } else {
4100 addReply(c,shared.err);
4101 }
4102 }
4103
4104 static void bgsaveCommand(redisClient *c) {
4105 if (server.bgsavechildpid != -1) {
4106 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4107 return;
4108 }
4109 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4110 char *status = "+Background saving started\r\n";
4111 addReplySds(c,sdsnew(status));
4112 } else {
4113 addReply(c,shared.err);
4114 }
4115 }
4116
4117 static void shutdownCommand(redisClient *c) {
4118 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4119 /* Kill the saving child if there is a background saving in progress.
4120 We want to avoid race conditions, for instance our saving child may
4121 overwrite the synchronous saving did by SHUTDOWN. */
4122 if (server.bgsavechildpid != -1) {
4123 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4124 kill(server.bgsavechildpid,SIGKILL);
4125 rdbRemoveTempFile(server.bgsavechildpid);
4126 }
4127 if (server.appendonly) {
4128 /* Append only file: fsync() the AOF and exit */
4129 fsync(server.appendfd);
4130 if (server.vm_enabled) unlink(server.vm_swap_file);
4131 exit(0);
4132 } else {
4133 /* Snapshotting. Perform a SYNC SAVE and exit */
4134 if (rdbSave(server.dbfilename) == REDIS_OK) {
4135 if (server.daemonize)
4136 unlink(server.pidfile);
4137 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4138 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4139 if (server.vm_enabled) unlink(server.vm_swap_file);
4140 exit(0);
4141 } else {
4142 /* Ooops.. error saving! The best we can do is to continue operating.
4143 * Note that if there was a background saving process, in the next
4144 * cron() Redis will be notified that the background saving aborted,
4145 * handling special stuff like slaves pending for synchronization... */
4146 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4147 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4148 }
4149 }
4150 }
4151
4152 static void renameGenericCommand(redisClient *c, int nx) {
4153 robj *o;
4154
4155 /* To use the same key as src and dst is probably an error */
4156 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4157 addReply(c,shared.sameobjecterr);
4158 return;
4159 }
4160
4161 o = lookupKeyWrite(c->db,c->argv[1]);
4162 if (o == NULL) {
4163 addReply(c,shared.nokeyerr);
4164 return;
4165 }
4166 incrRefCount(o);
4167 deleteIfVolatile(c->db,c->argv[2]);
4168 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4169 if (nx) {
4170 decrRefCount(o);
4171 addReply(c,shared.czero);
4172 return;
4173 }
4174 dictReplace(c->db->dict,c->argv[2],o);
4175 } else {
4176 incrRefCount(c->argv[2]);
4177 }
4178 deleteKey(c->db,c->argv[1]);
4179 server.dirty++;
4180 addReply(c,nx ? shared.cone : shared.ok);
4181 }
4182
4183 static void renameCommand(redisClient *c) {
4184 renameGenericCommand(c,0);
4185 }
4186
4187 static void renamenxCommand(redisClient *c) {
4188 renameGenericCommand(c,1);
4189 }
4190
4191 static void moveCommand(redisClient *c) {
4192 robj *o;
4193 redisDb *src, *dst;
4194 int srcid;
4195
4196 /* Obtain source and target DB pointers */
4197 src = c->db;
4198 srcid = c->db->id;
4199 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4200 addReply(c,shared.outofrangeerr);
4201 return;
4202 }
4203 dst = c->db;
4204 selectDb(c,srcid); /* Back to the source DB */
4205
4206 /* If the user is moving using as target the same
4207 * DB as the source DB it is probably an error. */
4208 if (src == dst) {
4209 addReply(c,shared.sameobjecterr);
4210 return;
4211 }
4212
4213 /* Check if the element exists and get a reference */
4214 o = lookupKeyWrite(c->db,c->argv[1]);
4215 if (!o) {
4216 addReply(c,shared.czero);
4217 return;
4218 }
4219
4220 /* Try to add the element to the target DB */
4221 deleteIfVolatile(dst,c->argv[1]);
4222 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4223 addReply(c,shared.czero);
4224 return;
4225 }
4226 incrRefCount(c->argv[1]);
4227 incrRefCount(o);
4228
4229 /* OK! key moved, free the entry in the source DB */
4230 deleteKey(src,c->argv[1]);
4231 server.dirty++;
4232 addReply(c,shared.cone);
4233 }
4234
4235 /* =================================== Lists ================================ */
4236 static void pushGenericCommand(redisClient *c, int where) {
4237 robj *lobj;
4238 list *list;
4239
4240 lobj = lookupKeyWrite(c->db,c->argv[1]);
4241 if (lobj == NULL) {
4242 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4243 addReply(c,shared.cone);
4244 return;
4245 }
4246 lobj = createListObject();
4247 list = lobj->ptr;
4248 if (where == REDIS_HEAD) {
4249 listAddNodeHead(list,c->argv[2]);
4250 } else {
4251 listAddNodeTail(list,c->argv[2]);
4252 }
4253 dictAdd(c->db->dict,c->argv[1],lobj);
4254 incrRefCount(c->argv[1]);
4255 incrRefCount(c->argv[2]);
4256 } else {
4257 if (lobj->type != REDIS_LIST) {
4258 addReply(c,shared.wrongtypeerr);
4259 return;
4260 }
4261 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4262 addReply(c,shared.cone);
4263 return;
4264 }
4265 list = lobj->ptr;
4266 if (where == REDIS_HEAD) {
4267 listAddNodeHead(list,c->argv[2]);
4268 } else {
4269 listAddNodeTail(list,c->argv[2]);
4270 }
4271 incrRefCount(c->argv[2]);
4272 }
4273 server.dirty++;
4274 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4275 }
4276
4277 static void lpushCommand(redisClient *c) {
4278 pushGenericCommand(c,REDIS_HEAD);
4279 }
4280
4281 static void rpushCommand(redisClient *c) {
4282 pushGenericCommand(c,REDIS_TAIL);
4283 }
4284
4285 static void llenCommand(redisClient *c) {
4286 robj *o;
4287 list *l;
4288
4289 o = lookupKeyRead(c->db,c->argv[1]);
4290 if (o == NULL) {
4291 addReply(c,shared.czero);
4292 return;
4293 } else {
4294 if (o->type != REDIS_LIST) {
4295 addReply(c,shared.wrongtypeerr);
4296 } else {
4297 l = o->ptr;
4298 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
4299 }
4300 }
4301 }
4302
4303 static void lindexCommand(redisClient *c) {
4304 robj *o;
4305 int index = atoi(c->argv[2]->ptr);
4306
4307 o = lookupKeyRead(c->db,c->argv[1]);
4308 if (o == NULL) {
4309 addReply(c,shared.nullbulk);
4310 } else {
4311 if (o->type != REDIS_LIST) {
4312 addReply(c,shared.wrongtypeerr);
4313 } else {
4314 list *list = o->ptr;
4315 listNode *ln;
4316
4317 ln = listIndex(list, index);
4318 if (ln == NULL) {
4319 addReply(c,shared.nullbulk);
4320 } else {
4321 robj *ele = listNodeValue(ln);
4322 addReplyBulkLen(c,ele);
4323 addReply(c,ele);
4324 addReply(c,shared.crlf);
4325 }
4326 }
4327 }
4328 }
4329
4330 static void lsetCommand(redisClient *c) {
4331 robj *o;
4332 int index = atoi(c->argv[2]->ptr);
4333
4334 o = lookupKeyWrite(c->db,c->argv[1]);
4335 if (o == NULL) {
4336 addReply(c,shared.nokeyerr);
4337 } else {
4338 if (o->type != REDIS_LIST) {
4339 addReply(c,shared.wrongtypeerr);
4340 } else {
4341 list *list = o->ptr;
4342 listNode *ln;
4343
4344 ln = listIndex(list, index);
4345 if (ln == NULL) {
4346 addReply(c,shared.outofrangeerr);
4347 } else {
4348 robj *ele = listNodeValue(ln);
4349
4350 decrRefCount(ele);
4351 listNodeValue(ln) = c->argv[3];
4352 incrRefCount(c->argv[3]);
4353 addReply(c,shared.ok);
4354 server.dirty++;
4355 }
4356 }
4357 }
4358 }
4359
4360 static void popGenericCommand(redisClient *c, int where) {
4361 robj *o;
4362
4363 o = lookupKeyWrite(c->db,c->argv[1]);
4364 if (o == NULL) {
4365 addReply(c,shared.nullbulk);
4366 } else {
4367 if (o->type != REDIS_LIST) {
4368 addReply(c,shared.wrongtypeerr);
4369 } else {
4370 list *list = o->ptr;
4371 listNode *ln;
4372
4373 if (where == REDIS_HEAD)
4374 ln = listFirst(list);
4375 else
4376 ln = listLast(list);
4377
4378 if (ln == NULL) {
4379 addReply(c,shared.nullbulk);
4380 } else {
4381 robj *ele = listNodeValue(ln);
4382 addReplyBulkLen(c,ele);
4383 addReply(c,ele);
4384 addReply(c,shared.crlf);
4385 listDelNode(list,ln);
4386 server.dirty++;
4387 }
4388 }
4389 }
4390 }
4391
4392 static void lpopCommand(redisClient *c) {
4393 popGenericCommand(c,REDIS_HEAD);
4394 }
4395
4396 static void rpopCommand(redisClient *c) {
4397 popGenericCommand(c,REDIS_TAIL);
4398 }
4399
4400 static void lrangeCommand(redisClient *c) {
4401 robj *o;
4402 int start = atoi(c->argv[2]->ptr);
4403 int end = atoi(c->argv[3]->ptr);
4404
4405 o = lookupKeyRead(c->db,c->argv[1]);
4406 if (o == NULL) {
4407 addReply(c,shared.nullmultibulk);
4408 } else {
4409 if (o->type != REDIS_LIST) {
4410 addReply(c,shared.wrongtypeerr);
4411 } else {
4412 list *list = o->ptr;
4413 listNode *ln;
4414 int llen = listLength(list);
4415 int rangelen, j;
4416 robj *ele;
4417
4418 /* convert negative indexes */
4419 if (start < 0) start = llen+start;
4420 if (end < 0) end = llen+end;
4421 if (start < 0) start = 0;
4422 if (end < 0) end = 0;
4423
4424 /* indexes sanity checks */
4425 if (start > end || start >= llen) {
4426 /* Out of range start or start > end result in empty list */
4427 addReply(c,shared.emptymultibulk);
4428 return;
4429 }
4430 if (end >= llen) end = llen-1;
4431 rangelen = (end-start)+1;
4432
4433 /* Return the result in form of a multi-bulk reply */
4434 ln = listIndex(list, start);
4435 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4436 for (j = 0; j < rangelen; j++) {
4437 ele = listNodeValue(ln);
4438 addReplyBulkLen(c,ele);
4439 addReply(c,ele);
4440 addReply(c,shared.crlf);
4441 ln = ln->next;
4442 }
4443 }
4444 }
4445 }
4446
4447 static void ltrimCommand(redisClient *c) {
4448 robj *o;
4449 int start = atoi(c->argv[2]->ptr);
4450 int end = atoi(c->argv[3]->ptr);
4451
4452 o = lookupKeyWrite(c->db,c->argv[1]);
4453 if (o == NULL) {
4454 addReply(c,shared.ok);
4455 } else {
4456 if (o->type != REDIS_LIST) {
4457 addReply(c,shared.wrongtypeerr);
4458 } else {
4459 list *list = o->ptr;
4460 listNode *ln;
4461 int llen = listLength(list);
4462 int j, ltrim, rtrim;
4463
4464 /* convert negative indexes */
4465 if (start < 0) start = llen+start;
4466 if (end < 0) end = llen+end;
4467 if (start < 0) start = 0;
4468 if (end < 0) end = 0;
4469
4470 /* indexes sanity checks */
4471 if (start > end || start >= llen) {
4472 /* Out of range start or start > end result in empty list */
4473 ltrim = llen;
4474 rtrim = 0;
4475 } else {
4476 if (end >= llen) end = llen-1;
4477 ltrim = start;
4478 rtrim = llen-end-1;
4479 }
4480
4481 /* Remove list elements to perform the trim */
4482 for (j = 0; j < ltrim; j++) {
4483 ln = listFirst(list);
4484 listDelNode(list,ln);
4485 }
4486 for (j = 0; j < rtrim; j++) {
4487 ln = listLast(list);
4488 listDelNode(list,ln);
4489 }
4490 server.dirty++;
4491 addReply(c,shared.ok);
4492 }
4493 }
4494 }
4495
4496 static void lremCommand(redisClient *c) {
4497 robj *o;
4498
4499 o = lookupKeyWrite(c->db,c->argv[1]);
4500 if (o == NULL) {
4501 addReply(c,shared.czero);
4502 } else {
4503 if (o->type != REDIS_LIST) {
4504 addReply(c,shared.wrongtypeerr);
4505 } else {
4506 list *list = o->ptr;
4507 listNode *ln, *next;
4508 int toremove = atoi(c->argv[2]->ptr);
4509 int removed = 0;
4510 int fromtail = 0;
4511
4512 if (toremove < 0) {
4513 toremove = -toremove;
4514 fromtail = 1;
4515 }
4516 ln = fromtail ? list->tail : list->head;
4517 while (ln) {
4518 robj *ele = listNodeValue(ln);
4519
4520 next = fromtail ? ln->prev : ln->next;
4521 if (compareStringObjects(ele,c->argv[3]) == 0) {
4522 listDelNode(list,ln);
4523 server.dirty++;
4524 removed++;
4525 if (toremove && removed == toremove) break;
4526 }
4527 ln = next;
4528 }
4529 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4530 }
4531 }
4532 }
4533
4534 /* This is the semantic of this command:
4535 * RPOPLPUSH srclist dstlist:
4536 * IF LLEN(srclist) > 0
4537 * element = RPOP srclist
4538 * LPUSH dstlist element
4539 * RETURN element
4540 * ELSE
4541 * RETURN nil
4542 * END
4543 * END
4544 *
4545 * The idea is to be able to get an element from a list in a reliable way
4546 * since the element is not just returned but pushed against another list
4547 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4548 */
4549 static void rpoplpushcommand(redisClient *c) {
4550 robj *sobj;
4551
4552 sobj = lookupKeyWrite(c->db,c->argv[1]);
4553 if (sobj == NULL) {
4554 addReply(c,shared.nullbulk);
4555 } else {
4556 if (sobj->type != REDIS_LIST) {
4557 addReply(c,shared.wrongtypeerr);
4558 } else {
4559 list *srclist = sobj->ptr;
4560 listNode *ln = listLast(srclist);
4561
4562 if (ln == NULL) {
4563 addReply(c,shared.nullbulk);
4564 } else {
4565 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4566 robj *ele = listNodeValue(ln);
4567 list *dstlist;
4568
4569 if (dobj && dobj->type != REDIS_LIST) {
4570 addReply(c,shared.wrongtypeerr);
4571 return;
4572 }
4573
4574 /* Add the element to the target list (unless it's directly
4575 * passed to some BLPOP-ing client */
4576 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4577 if (dobj == NULL) {
4578 /* Create the list if the key does not exist */
4579 dobj = createListObject();
4580 dictAdd(c->db->dict,c->argv[2],dobj);
4581 incrRefCount(c->argv[2]);
4582 }
4583 dstlist = dobj->ptr;
4584 listAddNodeHead(dstlist,ele);
4585 incrRefCount(ele);
4586 }
4587
4588 /* Send the element to the client as reply as well */
4589 addReplyBulkLen(c,ele);
4590 addReply(c,ele);
4591 addReply(c,shared.crlf);
4592
4593 /* Finally remove the element from the source list */
4594 listDelNode(srclist,ln);
4595 server.dirty++;
4596 }
4597 }
4598 }
4599 }
4600
4601
4602 /* ==================================== Sets ================================ */
4603
4604 static void saddCommand(redisClient *c) {
4605 robj *set;
4606
4607 set = lookupKeyWrite(c->db,c->argv[1]);
4608 if (set == NULL) {
4609 set = createSetObject();
4610 dictAdd(c->db->dict,c->argv[1],set);
4611 incrRefCount(c->argv[1]);
4612 } else {
4613 if (set->type != REDIS_SET) {
4614 addReply(c,shared.wrongtypeerr);
4615 return;
4616 }
4617 }
4618 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4619 incrRefCount(c->argv[2]);
4620 server.dirty++;
4621 addReply(c,shared.cone);
4622 } else {
4623 addReply(c,shared.czero);
4624 }
4625 }
4626
4627 static void sremCommand(redisClient *c) {
4628 robj *set;
4629
4630 set = lookupKeyWrite(c->db,c->argv[1]);
4631 if (set == NULL) {
4632 addReply(c,shared.czero);
4633 } else {
4634 if (set->type != REDIS_SET) {
4635 addReply(c,shared.wrongtypeerr);
4636 return;
4637 }
4638 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4639 server.dirty++;
4640 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4641 addReply(c,shared.cone);
4642 } else {
4643 addReply(c,shared.czero);
4644 }
4645 }
4646 }
4647
4648 static void smoveCommand(redisClient *c) {
4649 robj *srcset, *dstset;
4650
4651 srcset = lookupKeyWrite(c->db,c->argv[1]);
4652 dstset = lookupKeyWrite(c->db,c->argv[2]);
4653
4654 /* If the source key does not exist return 0, if it's of the wrong type
4655 * raise an error */
4656 if (srcset == NULL || srcset->type != REDIS_SET) {
4657 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4658 return;
4659 }
4660 /* Error if the destination key is not a set as well */
4661 if (dstset && dstset->type != REDIS_SET) {
4662 addReply(c,shared.wrongtypeerr);
4663 return;
4664 }
4665 /* Remove the element from the source set */
4666 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4667 /* Key not found in the src set! return zero */
4668 addReply(c,shared.czero);
4669 return;
4670 }
4671 server.dirty++;
4672 /* Add the element to the destination set */
4673 if (!dstset) {
4674 dstset = createSetObject();
4675 dictAdd(c->db->dict,c->argv[2],dstset);
4676 incrRefCount(c->argv[2]);
4677 }
4678 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4679 incrRefCount(c->argv[3]);
4680 addReply(c,shared.cone);
4681 }
4682
4683 static void sismemberCommand(redisClient *c) {
4684 robj *set;
4685
4686 set = lookupKeyRead(c->db,c->argv[1]);
4687 if (set == NULL) {
4688 addReply(c,shared.czero);
4689 } else {
4690 if (set->type != REDIS_SET) {
4691 addReply(c,shared.wrongtypeerr);
4692 return;
4693 }
4694 if (dictFind(set->ptr,c->argv[2]))
4695 addReply(c,shared.cone);
4696 else
4697 addReply(c,shared.czero);
4698 }
4699 }
4700
4701 static void scardCommand(redisClient *c) {
4702 robj *o;
4703 dict *s;
4704
4705 o = lookupKeyRead(c->db,c->argv[1]);
4706 if (o == NULL) {
4707 addReply(c,shared.czero);
4708 return;
4709 } else {
4710 if (o->type != REDIS_SET) {
4711 addReply(c,shared.wrongtypeerr);
4712 } else {
4713 s = o->ptr;
4714 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4715 dictSize(s)));
4716 }
4717 }
4718 }
4719
4720 static void spopCommand(redisClient *c) {
4721 robj *set;
4722 dictEntry *de;
4723
4724 set = lookupKeyWrite(c->db,c->argv[1]);
4725 if (set == NULL) {
4726 addReply(c,shared.nullbulk);
4727 } else {
4728 if (set->type != REDIS_SET) {
4729 addReply(c,shared.wrongtypeerr);
4730 return;
4731 }
4732 de = dictGetRandomKey(set->ptr);
4733 if (de == NULL) {
4734 addReply(c,shared.nullbulk);
4735 } else {
4736 robj *ele = dictGetEntryKey(de);
4737
4738 addReplyBulkLen(c,ele);
4739 addReply(c,ele);
4740 addReply(c,shared.crlf);
4741 dictDelete(set->ptr,ele);
4742 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4743 server.dirty++;
4744 }
4745 }
4746 }
4747
4748 static void srandmemberCommand(redisClient *c) {
4749 robj *set;
4750 dictEntry *de;
4751
4752 set = lookupKeyRead(c->db,c->argv[1]);
4753 if (set == NULL) {
4754 addReply(c,shared.nullbulk);
4755 } else {
4756 if (set->type != REDIS_SET) {
4757 addReply(c,shared.wrongtypeerr);
4758 return;
4759 }
4760 de = dictGetRandomKey(set->ptr);
4761 if (de == NULL) {
4762 addReply(c,shared.nullbulk);
4763 } else {
4764 robj *ele = dictGetEntryKey(de);
4765
4766 addReplyBulkLen(c,ele);
4767 addReply(c,ele);
4768 addReply(c,shared.crlf);
4769 }
4770 }
4771 }
4772
4773 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4774 dict **d1 = (void*) s1, **d2 = (void*) s2;
4775
4776 return dictSize(*d1)-dictSize(*d2);
4777 }
4778
4779 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4780 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4781 dictIterator *di;
4782 dictEntry *de;
4783 robj *lenobj = NULL, *dstset = NULL;
4784 unsigned long j, cardinality = 0;
4785
4786 for (j = 0; j < setsnum; j++) {
4787 robj *setobj;
4788
4789 setobj = dstkey ?
4790 lookupKeyWrite(c->db,setskeys[j]) :
4791 lookupKeyRead(c->db,setskeys[j]);
4792 if (!setobj) {
4793 zfree(dv);
4794 if (dstkey) {
4795 if (deleteKey(c->db,dstkey))
4796 server.dirty++;
4797 addReply(c,shared.czero);
4798 } else {
4799 addReply(c,shared.nullmultibulk);
4800 }
4801 return;
4802 }
4803 if (setobj->type != REDIS_SET) {
4804 zfree(dv);
4805 addReply(c,shared.wrongtypeerr);
4806 return;
4807 }
4808 dv[j] = setobj->ptr;
4809 }
4810 /* Sort sets from the smallest to largest, this will improve our
4811 * algorithm's performace */
4812 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4813
4814 /* The first thing we should output is the total number of elements...
4815 * since this is a multi-bulk write, but at this stage we don't know
4816 * the intersection set size, so we use a trick, append an empty object
4817 * to the output list and save the pointer to later modify it with the
4818 * right length */
4819 if (!dstkey) {
4820 lenobj = createObject(REDIS_STRING,NULL);
4821 addReply(c,lenobj);
4822 decrRefCount(lenobj);
4823 } else {
4824 /* If we have a target key where to store the resulting set
4825 * create this key with an empty set inside */
4826 dstset = createSetObject();
4827 }
4828
4829 /* Iterate all the elements of the first (smallest) set, and test
4830 * the element against all the other sets, if at least one set does
4831 * not include the element it is discarded */
4832 di = dictGetIterator(dv[0]);
4833
4834 while((de = dictNext(di)) != NULL) {
4835 robj *ele;
4836
4837 for (j = 1; j < setsnum; j++)
4838 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4839 if (j != setsnum)
4840 continue; /* at least one set does not contain the member */
4841 ele = dictGetEntryKey(de);
4842 if (!dstkey) {
4843 addReplyBulkLen(c,ele);
4844 addReply(c,ele);
4845 addReply(c,shared.crlf);
4846 cardinality++;
4847 } else {
4848 dictAdd(dstset->ptr,ele,NULL);
4849 incrRefCount(ele);
4850 }
4851 }
4852 dictReleaseIterator(di);
4853
4854 if (dstkey) {
4855 /* Store the resulting set into the target */
4856 deleteKey(c->db,dstkey);
4857 dictAdd(c->db->dict,dstkey,dstset);
4858 incrRefCount(dstkey);
4859 }
4860
4861 if (!dstkey) {
4862 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4863 } else {
4864 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4865 dictSize((dict*)dstset->ptr)));
4866 server.dirty++;
4867 }
4868 zfree(dv);
4869 }
4870
4871 static void sinterCommand(redisClient *c) {
4872 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4873 }
4874
4875 static void sinterstoreCommand(redisClient *c) {
4876 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4877 }
4878
4879 #define REDIS_OP_UNION 0
4880 #define REDIS_OP_DIFF 1
4881 #define REDIS_OP_INTER 2
4882
4883 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4884 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4885 dictIterator *di;
4886 dictEntry *de;
4887 robj *dstset = NULL;
4888 int j, cardinality = 0;
4889
4890 for (j = 0; j < setsnum; j++) {
4891 robj *setobj;
4892
4893 setobj = dstkey ?
4894 lookupKeyWrite(c->db,setskeys[j]) :
4895 lookupKeyRead(c->db,setskeys[j]);
4896 if (!setobj) {
4897 dv[j] = NULL;
4898 continue;
4899 }
4900 if (setobj->type != REDIS_SET) {
4901 zfree(dv);
4902 addReply(c,shared.wrongtypeerr);
4903 return;
4904 }
4905 dv[j] = setobj->ptr;
4906 }
4907
4908 /* We need a temp set object to store our union. If the dstkey
4909 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4910 * this set object will be the resulting object to set into the target key*/
4911 dstset = createSetObject();
4912
4913 /* Iterate all the elements of all the sets, add every element a single
4914 * time to the result set */
4915 for (j = 0; j < setsnum; j++) {
4916 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4917 if (!dv[j]) continue; /* non existing keys are like empty sets */
4918
4919 di = dictGetIterator(dv[j]);
4920
4921 while((de = dictNext(di)) != NULL) {
4922 robj *ele;
4923
4924 /* dictAdd will not add the same element multiple times */
4925 ele = dictGetEntryKey(de);
4926 if (op == REDIS_OP_UNION || j == 0) {
4927 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4928 incrRefCount(ele);
4929 cardinality++;
4930 }
4931 } else if (op == REDIS_OP_DIFF) {
4932 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4933 cardinality--;
4934 }
4935 }
4936 }
4937 dictReleaseIterator(di);
4938
4939 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
4940 }
4941
4942 /* Output the content of the resulting set, if not in STORE mode */
4943 if (!dstkey) {
4944 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4945 di = dictGetIterator(dstset->ptr);
4946 while((de = dictNext(di)) != NULL) {
4947 robj *ele;
4948
4949 ele = dictGetEntryKey(de);
4950 addReplyBulkLen(c,ele);
4951 addReply(c,ele);
4952 addReply(c,shared.crlf);
4953 }
4954 dictReleaseIterator(di);
4955 } else {
4956 /* If we have a target key where to store the resulting set
4957 * create this key with the result set inside */
4958 deleteKey(c->db,dstkey);
4959 dictAdd(c->db->dict,dstkey,dstset);
4960 incrRefCount(dstkey);
4961 }
4962
4963 /* Cleanup */
4964 if (!dstkey) {
4965 decrRefCount(dstset);
4966 } else {
4967 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
4968 dictSize((dict*)dstset->ptr)));
4969 server.dirty++;
4970 }
4971 zfree(dv);
4972 }
4973
4974 static void sunionCommand(redisClient *c) {
4975 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4976 }
4977
4978 static void sunionstoreCommand(redisClient *c) {
4979 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4980 }
4981
4982 static void sdiffCommand(redisClient *c) {
4983 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4984 }
4985
4986 static void sdiffstoreCommand(redisClient *c) {
4987 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4988 }
4989
4990 /* ==================================== ZSets =============================== */
4991
4992 /* ZSETs are ordered sets using two data structures to hold the same elements
4993 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4994 * data structure.
4995 *
4996 * The elements are added to a hash table mapping Redis objects to scores.
4997 * At the same time the elements are added to a skip list mapping scores
4998 * to Redis objects (so objects are sorted by scores in this "view"). */
4999
5000 /* This skiplist implementation is almost a C translation of the original
5001 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5002 * Alternative to Balanced Trees", modified in three ways:
5003 * a) this implementation allows for repeated values.
5004 * b) the comparison is not just by key (our 'score') but by satellite data.
5005 * c) there is a back pointer, so it's a doubly linked list with the back
5006 * pointers being only at "level 1". This allows to traverse the list
5007 * from tail to head, useful for ZREVRANGE. */
5008
5009 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5010 zskiplistNode *zn = zmalloc(sizeof(*zn));
5011
5012 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5013 if (level > 0)
5014 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5015 zn->score = score;
5016 zn->obj = obj;
5017 return zn;
5018 }
5019
5020 static zskiplist *zslCreate(void) {
5021 int j;
5022 zskiplist *zsl;
5023
5024 zsl = zmalloc(sizeof(*zsl));
5025 zsl->level = 1;
5026 zsl->length = 0;
5027 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5028 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5029 zsl->header->forward[j] = NULL;
5030
5031 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5032 if (j < ZSKIPLIST_MAXLEVEL-1)
5033 zsl->header->span[j] = 0;
5034 }
5035 zsl->header->backward = NULL;
5036 zsl->tail = NULL;
5037 return zsl;
5038 }
5039
5040 static void zslFreeNode(zskiplistNode *node) {
5041 decrRefCount(node->obj);
5042 zfree(node->forward);
5043 zfree(node->span);
5044 zfree(node);
5045 }
5046
5047 static void zslFree(zskiplist *zsl) {
5048 zskiplistNode *node = zsl->header->forward[0], *next;
5049
5050 zfree(zsl->header->forward);
5051 zfree(zsl->header->span);
5052 zfree(zsl->header);
5053 while(node) {
5054 next = node->forward[0];
5055 zslFreeNode(node);
5056 node = next;
5057 }
5058 zfree(zsl);
5059 }
5060
5061 static int zslRandomLevel(void) {
5062 int level = 1;
5063 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5064 level += 1;
5065 return level;
5066 }
5067
5068 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5069 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5070 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5071 int i, level;
5072
5073 x = zsl->header;
5074 for (i = zsl->level-1; i >= 0; i--) {
5075 /* store rank that is crossed to reach the insert position */
5076 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5077
5078 while (x->forward[i] &&
5079 (x->forward[i]->score < score ||
5080 (x->forward[i]->score == score &&
5081 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5082 rank[i] += i > 0 ? x->span[i-1] : 1;
5083 x = x->forward[i];
5084 }
5085 update[i] = x;
5086 }
5087 /* we assume the key is not already inside, since we allow duplicated
5088 * scores, and the re-insertion of score and redis object should never
5089 * happpen since the caller of zslInsert() should test in the hash table
5090 * if the element is already inside or not. */
5091 level = zslRandomLevel();
5092 if (level > zsl->level) {
5093 for (i = zsl->level; i < level; i++) {
5094 rank[i] = 0;
5095 update[i] = zsl->header;
5096 update[i]->span[i-1] = zsl->length;
5097 }
5098 zsl->level = level;
5099 }
5100 x = zslCreateNode(level,score,obj);
5101 for (i = 0; i < level; i++) {
5102 x->forward[i] = update[i]->forward[i];
5103 update[i]->forward[i] = x;
5104
5105 /* update span covered by update[i] as x is inserted here */
5106 if (i > 0) {
5107 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5108 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5109 }
5110 }
5111
5112 /* increment span for untouched levels */
5113 for (i = level; i < zsl->level; i++) {
5114 update[i]->span[i-1]++;
5115 }
5116
5117 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5118 if (x->forward[0])
5119 x->forward[0]->backward = x;
5120 else
5121 zsl->tail = x;
5122 zsl->length++;
5123 }
5124
5125 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5126 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5127 int i;
5128 for (i = 0; i < zsl->level; i++) {
5129 if (update[i]->forward[i] == x) {
5130 if (i > 0) {
5131 update[i]->span[i-1] += x->span[i-1] - 1;
5132 }
5133 update[i]->forward[i] = x->forward[i];
5134 } else {
5135 /* invariant: i > 0, because update[0]->forward[0]
5136 * is always equal to x */
5137 update[i]->span[i-1] -= 1;
5138 }
5139 }
5140 if (x->forward[0]) {
5141 x->forward[0]->backward = x->backward;
5142 } else {
5143 zsl->tail = x->backward;
5144 }
5145 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5146 zsl->level--;
5147 zsl->length--;
5148 }
5149
5150 /* Delete an element with matching score/object from the skiplist. */
5151 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5152 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5153 int i;
5154
5155 x = zsl->header;
5156 for (i = zsl->level-1; i >= 0; i--) {
5157 while (x->forward[i] &&
5158 (x->forward[i]->score < score ||
5159 (x->forward[i]->score == score &&
5160 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5161 x = x->forward[i];
5162 update[i] = x;
5163 }
5164 /* We may have multiple elements with the same score, what we need
5165 * is to find the element with both the right score and object. */
5166 x = x->forward[0];
5167 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5168 zslDeleteNode(zsl, x, update);
5169 zslFreeNode(x);
5170 return 1;
5171 } else {
5172 return 0; /* not found */
5173 }
5174 return 0; /* not found */
5175 }
5176
5177 /* Delete all the elements with score between min and max from the skiplist.
5178 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5179 * Note that this function takes the reference to the hash table view of the
5180 * sorted set, in order to remove the elements from the hash table too. */
5181 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5182 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5183 unsigned long removed = 0;
5184 int i;
5185
5186 x = zsl->header;
5187 for (i = zsl->level-1; i >= 0; i--) {
5188 while (x->forward[i] && x->forward[i]->score < min)
5189 x = x->forward[i];
5190 update[i] = x;
5191 }
5192 /* We may have multiple elements with the same score, what we need
5193 * is to find the element with both the right score and object. */
5194 x = x->forward[0];
5195 while (x && x->score <= max) {
5196 zskiplistNode *next = x->forward[0];
5197 zslDeleteNode(zsl, x, update);
5198 dictDelete(dict,x->obj);
5199 zslFreeNode(x);
5200 removed++;
5201 x = next;
5202 }
5203 return removed; /* not found */
5204 }
5205
5206 /* Delete all the elements with rank between start and end from the skiplist.
5207 * Start and end are inclusive. Note that start and end need to be 1-based */
5208 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5209 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5210 unsigned long traversed = 0, removed = 0;
5211 int i;
5212
5213 x = zsl->header;
5214 for (i = zsl->level-1; i >= 0; i--) {
5215 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5216 traversed += i > 0 ? x->span[i-1] : 1;
5217 x = x->forward[i];
5218 }
5219 update[i] = x;
5220 }
5221
5222 traversed++;
5223 x = x->forward[0];
5224 while (x && traversed <= end) {
5225 zskiplistNode *next = x->forward[0];
5226 zslDeleteNode(zsl, x, update);
5227 dictDelete(dict,x->obj);
5228 zslFreeNode(x);
5229 removed++;
5230 traversed++;
5231 x = next;
5232 }
5233 return removed;
5234 }
5235
5236 /* Find the first node having a score equal or greater than the specified one.
5237 * Returns NULL if there is no match. */
5238 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5239 zskiplistNode *x;
5240 int i;
5241
5242 x = zsl->header;
5243 for (i = zsl->level-1; i >= 0; i--) {
5244 while (x->forward[i] && x->forward[i]->score < score)
5245 x = x->forward[i];
5246 }
5247 /* We may have multiple elements with the same score, what we need
5248 * is to find the element with both the right score and object. */
5249 return x->forward[0];
5250 }
5251
5252 /* Find the rank for an element by both score and key.
5253 * Returns 0 when the element cannot be found, rank otherwise.
5254 * Note that the rank is 1-based due to the span of zsl->header to the
5255 * first element. */
5256 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5257 zskiplistNode *x;
5258 unsigned long rank = 0;
5259 int i;
5260
5261 x = zsl->header;
5262 for (i = zsl->level-1; i >= 0; i--) {
5263 while (x->forward[i] &&
5264 (x->forward[i]->score < score ||
5265 (x->forward[i]->score == score &&
5266 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5267 rank += i > 0 ? x->span[i-1] : 1;
5268 x = x->forward[i];
5269 }
5270
5271 /* x might be equal to zsl->header, so test if obj is non-NULL */
5272 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5273 return rank;
5274 }
5275 }
5276 return 0;
5277 }
5278
5279 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5280 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5281 zskiplistNode *x;
5282 unsigned long traversed = 0;
5283 int i;
5284
5285 x = zsl->header;
5286 for (i = zsl->level-1; i >= 0; i--) {
5287 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) <= rank) {
5288 traversed += i > 0 ? x->span[i-1] : 1;
5289 x = x->forward[i];
5290 }
5291
5292 if (traversed == rank) {
5293 return x;
5294 }
5295 }
5296 return NULL;
5297 }
5298
5299 /* The actual Z-commands implementations */
5300
5301 /* This generic command implements both ZADD and ZINCRBY.
5302 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5303 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5304 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5305 robj *zsetobj;
5306 zset *zs;
5307 double *score;
5308
5309 zsetobj = lookupKeyWrite(c->db,key);
5310 if (zsetobj == NULL) {
5311 zsetobj = createZsetObject();
5312 dictAdd(c->db->dict,key,zsetobj);
5313 incrRefCount(key);
5314 } else {
5315 if (zsetobj->type != REDIS_ZSET) {
5316 addReply(c,shared.wrongtypeerr);
5317 return;
5318 }
5319 }
5320 zs = zsetobj->ptr;
5321
5322 /* Ok now since we implement both ZADD and ZINCRBY here the code
5323 * needs to handle the two different conditions. It's all about setting
5324 * '*score', that is, the new score to set, to the right value. */
5325 score = zmalloc(sizeof(double));
5326 if (doincrement) {
5327 dictEntry *de;
5328
5329 /* Read the old score. If the element was not present starts from 0 */
5330 de = dictFind(zs->dict,ele);
5331 if (de) {
5332 double *oldscore = dictGetEntryVal(de);
5333 *score = *oldscore + scoreval;
5334 } else {
5335 *score = scoreval;
5336 }
5337 } else {
5338 *score = scoreval;
5339 }
5340
5341 /* What follows is a simple remove and re-insert operation that is common
5342 * to both ZADD and ZINCRBY... */
5343 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5344 /* case 1: New element */
5345 incrRefCount(ele); /* added to hash */
5346 zslInsert(zs->zsl,*score,ele);
5347 incrRefCount(ele); /* added to skiplist */
5348 server.dirty++;
5349 if (doincrement)
5350 addReplyDouble(c,*score);
5351 else
5352 addReply(c,shared.cone);
5353 } else {
5354 dictEntry *de;
5355 double *oldscore;
5356
5357 /* case 2: Score update operation */
5358 de = dictFind(zs->dict,ele);
5359 redisAssert(de != NULL);
5360 oldscore = dictGetEntryVal(de);
5361 if (*score != *oldscore) {
5362 int deleted;
5363
5364 /* Remove and insert the element in the skip list with new score */
5365 deleted = zslDelete(zs->zsl,*oldscore,ele);
5366 redisAssert(deleted != 0);
5367 zslInsert(zs->zsl,*score,ele);
5368 incrRefCount(ele);
5369 /* Update the score in the hash table */
5370 dictReplace(zs->dict,ele,score);
5371 server.dirty++;
5372 } else {
5373 zfree(score);
5374 }
5375 if (doincrement)
5376 addReplyDouble(c,*score);
5377 else
5378 addReply(c,shared.czero);
5379 }
5380 }
5381
5382 static void zaddCommand(redisClient *c) {
5383 double scoreval;
5384
5385 scoreval = strtod(c->argv[2]->ptr,NULL);
5386 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5387 }
5388
5389 static void zincrbyCommand(redisClient *c) {
5390 double scoreval;
5391
5392 scoreval = strtod(c->argv[2]->ptr,NULL);
5393 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5394 }
5395
5396 static void zremCommand(redisClient *c) {
5397 robj *zsetobj;
5398 zset *zs;
5399
5400 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5401 if (zsetobj == NULL) {
5402 addReply(c,shared.czero);
5403 } else {
5404 dictEntry *de;
5405 double *oldscore;
5406 int deleted;
5407
5408 if (zsetobj->type != REDIS_ZSET) {
5409 addReply(c,shared.wrongtypeerr);
5410 return;
5411 }
5412 zs = zsetobj->ptr;
5413 de = dictFind(zs->dict,c->argv[2]);
5414 if (de == NULL) {
5415 addReply(c,shared.czero);
5416 return;
5417 }
5418 /* Delete from the skiplist */
5419 oldscore = dictGetEntryVal(de);
5420 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5421 redisAssert(deleted != 0);
5422
5423 /* Delete from the hash table */
5424 dictDelete(zs->dict,c->argv[2]);
5425 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5426 server.dirty++;
5427 addReply(c,shared.cone);
5428 }
5429 }
5430
5431 static void zremrangebyscoreCommand(redisClient *c) {
5432 double min = strtod(c->argv[2]->ptr,NULL);
5433 double max = strtod(c->argv[3]->ptr,NULL);
5434 robj *zsetobj;
5435 zset *zs;
5436
5437 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5438 if (zsetobj == NULL) {
5439 addReply(c,shared.czero);
5440 } else {
5441 long deleted;
5442
5443 if (zsetobj->type != REDIS_ZSET) {
5444 addReply(c,shared.wrongtypeerr);
5445 return;
5446 }
5447 zs = zsetobj->ptr;
5448 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5449 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5450 server.dirty += deleted;
5451 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5452 }
5453 }
5454
5455 static void zremrangebyrankCommand(redisClient *c) {
5456 int start = atoi(c->argv[2]->ptr);
5457 int end = atoi(c->argv[3]->ptr);
5458 robj *zsetobj;
5459 zset *zs;
5460
5461 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5462 if (zsetobj == NULL) {
5463 addReply(c,shared.czero);
5464 } else {
5465 if (zsetobj->type != REDIS_ZSET) {
5466 addReply(c,shared.wrongtypeerr);
5467 return;
5468 }
5469
5470 zs = zsetobj->ptr;
5471 int llen = zs->zsl->length;
5472 long deleted;
5473
5474 /* convert negative indexes */
5475 if (start < 0) start = llen+start;
5476 if (end < 0) end = llen+end;
5477 if (start < 0) start = 0;
5478 if (end < 0) end = 0;
5479
5480 /* indexes sanity checks */
5481 if (start > end || start >= llen) {
5482 addReply(c,shared.czero);
5483 return;
5484 }
5485 if (end >= llen) end = llen-1;
5486
5487 /* increment start and end because zsl*Rank functions
5488 * use 1-based rank */
5489 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5490 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5491 server.dirty += deleted;
5492 addReplyLong(c, deleted);
5493 }
5494 }
5495
5496 typedef struct {
5497 dict *dict;
5498 double weight;
5499 } zsetopsrc;
5500
5501 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5502 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5503 unsigned long size1, size2;
5504 size1 = d1->dict ? dictSize(d1->dict) : 0;
5505 size2 = d2->dict ? dictSize(d2->dict) : 0;
5506 return size1 - size2;
5507 }
5508
5509 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5510 int i, j, zsetnum;
5511 zsetopsrc *src;
5512 robj *dstobj;
5513 zset *dstzset;
5514 dictIterator *di;
5515 dictEntry *de;
5516
5517 /* expect zsetnum input keys to be given */
5518 zsetnum = atoi(c->argv[2]->ptr);
5519 if (zsetnum < 1) {
5520 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5521 return;
5522 }
5523
5524 /* test if the expected number of keys would overflow */
5525 if (3+zsetnum > c->argc) {
5526 addReply(c,shared.syntaxerr);
5527 return;
5528 }
5529
5530 /* read keys to be used for input */
5531 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5532 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5533 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5534 if (!zsetobj) {
5535 src[i].dict = NULL;
5536 } else {
5537 if (zsetobj->type != REDIS_ZSET) {
5538 zfree(src);
5539 addReply(c,shared.wrongtypeerr);
5540 return;
5541 }
5542 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5543 }
5544
5545 /* default all weights to 1 */
5546 src[i].weight = 1.0;
5547 }
5548
5549 /* parse optional extra arguments */
5550 if (j < c->argc) {
5551 int remaining = c->argc-j;
5552
5553 while (remaining) {
5554 if (!strcasecmp(c->argv[j]->ptr,"weights")) {
5555 j++; remaining--;
5556 if (remaining < zsetnum) {
5557 zfree(src);
5558 addReplySds(c,sdsnew("-ERR not enough weights for ZUNION/ZINTER\r\n"));
5559 return;
5560 }
5561 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5562 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5563 }
5564 } else {
5565 zfree(src);
5566 addReply(c,shared.syntaxerr);
5567 return;
5568 }
5569 }
5570 }
5571
5572 dstobj = createZsetObject();
5573 dstzset = dstobj->ptr;
5574
5575 if (op == REDIS_OP_INTER) {
5576 /* sort sets from the smallest to largest, this will improve our
5577 * algorithm's performance */
5578 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5579
5580 /* skip going over all entries if the smallest zset is NULL or empty */
5581 if (src[0].dict && dictSize(src[0].dict) > 0) {
5582 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5583 * from small to large, all src[i > 0].dict are non-empty too */
5584 di = dictGetIterator(src[0].dict);
5585 while((de = dictNext(di)) != NULL) {
5586 double *score = zmalloc(sizeof(double));
5587 *score = 0.0;
5588
5589 for (j = 0; j < zsetnum; j++) {
5590 dictEntry *other = (j == 0) ? de : dictFind(src[j].dict,dictGetEntryKey(de));
5591 if (other) {
5592 *score = *score + src[j].weight * (*(double*)dictGetEntryVal(other));
5593 } else {
5594 break;
5595 }
5596 }
5597
5598 /* skip entry when not present in every source dict */
5599 if (j != zsetnum) {
5600 zfree(score);
5601 } else {
5602 robj *o = dictGetEntryKey(de);
5603 dictAdd(dstzset->dict,o,score);
5604 incrRefCount(o); /* added to dictionary */
5605 zslInsert(dstzset->zsl,*score,o);
5606 incrRefCount(o); /* added to skiplist */
5607 }
5608 }
5609 dictReleaseIterator(di);
5610 }
5611 } else if (op == REDIS_OP_UNION) {
5612 for (i = 0; i < zsetnum; i++) {
5613 if (!src[i].dict) continue;
5614
5615 di = dictGetIterator(src[i].dict);
5616 while((de = dictNext(di)) != NULL) {
5617 /* skip key when already processed */
5618 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5619
5620 double *score = zmalloc(sizeof(double));
5621 *score = 0.0;
5622 for (j = 0; j < zsetnum; j++) {
5623 if (!src[j].dict) continue;
5624
5625 dictEntry *other = (i == j) ? de : dictFind(src[j].dict,dictGetEntryKey(de));
5626 if (other) {
5627 *score = *score + src[j].weight * (*(double*)dictGetEntryVal(other));
5628 }
5629 }
5630
5631 robj *o = dictGetEntryKey(de);
5632 dictAdd(dstzset->dict,o,score);
5633 incrRefCount(o); /* added to dictionary */
5634 zslInsert(dstzset->zsl,*score,o);
5635 incrRefCount(o); /* added to skiplist */
5636 }
5637 dictReleaseIterator(di);
5638 }
5639 } else {
5640 /* unknown operator */
5641 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5642 }
5643
5644 deleteKey(c->db,dstkey);
5645 dictAdd(c->db->dict,dstkey,dstobj);
5646 incrRefCount(dstkey);
5647
5648 addReplyLong(c, dstzset->zsl->length);
5649 server.dirty++;
5650 zfree(src);
5651 }
5652
5653 static void zunionCommand(redisClient *c) {
5654 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5655 }
5656
5657 static void zinterCommand(redisClient *c) {
5658 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5659 }
5660
5661 static void zrangeGenericCommand(redisClient *c, int reverse) {
5662 robj *o;
5663 int start = atoi(c->argv[2]->ptr);
5664 int end = atoi(c->argv[3]->ptr);
5665 int withscores = 0;
5666
5667 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5668 withscores = 1;
5669 } else if (c->argc >= 5) {
5670 addReply(c,shared.syntaxerr);
5671 return;
5672 }
5673
5674 o = lookupKeyRead(c->db,c->argv[1]);
5675 if (o == NULL) {
5676 addReply(c,shared.nullmultibulk);
5677 } else {
5678 if (o->type != REDIS_ZSET) {
5679 addReply(c,shared.wrongtypeerr);
5680 } else {
5681 zset *zsetobj = o->ptr;
5682 zskiplist *zsl = zsetobj->zsl;
5683 zskiplistNode *ln;
5684
5685 int llen = zsl->length;
5686 int rangelen, j;
5687 robj *ele;
5688
5689 /* convert negative indexes */
5690 if (start < 0) start = llen+start;
5691 if (end < 0) end = llen+end;
5692 if (start < 0) start = 0;
5693 if (end < 0) end = 0;
5694
5695 /* indexes sanity checks */
5696 if (start > end || start >= llen) {
5697 /* Out of range start or start > end result in empty list */
5698 addReply(c,shared.emptymultibulk);
5699 return;
5700 }
5701 if (end >= llen) end = llen-1;
5702 rangelen = (end-start)+1;
5703
5704 /* check if starting point is trivial, before searching
5705 * the element in log(N) time */
5706 if (reverse) {
5707 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5708 } else {
5709 ln = start == 0 ? zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5710 }
5711
5712 /* Return the result in form of a multi-bulk reply */
5713 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5714 withscores ? (rangelen*2) : rangelen));
5715 for (j = 0; j < rangelen; j++) {
5716 ele = ln->obj;
5717 addReplyBulkLen(c,ele);
5718 addReply(c,ele);
5719 addReply(c,shared.crlf);
5720 if (withscores)
5721 addReplyDouble(c,ln->score);
5722 ln = reverse ? ln->backward : ln->forward[0];
5723 }
5724 }
5725 }
5726 }
5727
5728 static void zrangeCommand(redisClient *c) {
5729 zrangeGenericCommand(c,0);
5730 }
5731
5732 static void zrevrangeCommand(redisClient *c) {
5733 zrangeGenericCommand(c,1);
5734 }
5735
5736 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5737 * If justcount is non-zero, just the count is returned. */
5738 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5739 robj *o;
5740 double min, max;
5741 int minex = 0, maxex = 0; /* are min or max exclusive? */
5742 int offset = 0, limit = -1;
5743 int withscores = 0;
5744 int badsyntax = 0;
5745
5746 /* Parse the min-max interval. If one of the values is prefixed
5747 * by the "(" character, it's considered "open". For instance
5748 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5749 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5750 if (((char*)c->argv[2]->ptr)[0] == '(') {
5751 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5752 minex = 1;
5753 } else {
5754 min = strtod(c->argv[2]->ptr,NULL);
5755 }
5756 if (((char*)c->argv[3]->ptr)[0] == '(') {
5757 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5758 maxex = 1;
5759 } else {
5760 max = strtod(c->argv[3]->ptr,NULL);
5761 }
5762
5763 /* Parse "WITHSCORES": note that if the command was called with
5764 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5765 * enter the following paths to parse WITHSCORES and LIMIT. */
5766 if (c->argc == 5 || c->argc == 8) {
5767 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5768 withscores = 1;
5769 else
5770 badsyntax = 1;
5771 }
5772 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5773 badsyntax = 1;
5774 if (badsyntax) {
5775 addReplySds(c,
5776 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5777 return;
5778 }
5779
5780 /* Parse "LIMIT" */
5781 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5782 addReply(c,shared.syntaxerr);
5783 return;
5784 } else if (c->argc == (7 + withscores)) {
5785 offset = atoi(c->argv[5]->ptr);
5786 limit = atoi(c->argv[6]->ptr);
5787 if (offset < 0) offset = 0;
5788 }
5789
5790 /* Ok, lookup the key and get the range */
5791 o = lookupKeyRead(c->db,c->argv[1]);
5792 if (o == NULL) {
5793 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5794 } else {
5795 if (o->type != REDIS_ZSET) {
5796 addReply(c,shared.wrongtypeerr);
5797 } else {
5798 zset *zsetobj = o->ptr;
5799 zskiplist *zsl = zsetobj->zsl;
5800 zskiplistNode *ln;
5801 robj *ele, *lenobj = NULL;
5802 unsigned long rangelen = 0;
5803
5804 /* Get the first node with the score >= min, or with
5805 * score > min if 'minex' is true. */
5806 ln = zslFirstWithScore(zsl,min);
5807 while (minex && ln && ln->score == min) ln = ln->forward[0];
5808
5809 if (ln == NULL) {
5810 /* No element matching the speciifed interval */
5811 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5812 return;
5813 }
5814
5815 /* We don't know in advance how many matching elements there
5816 * are in the list, so we push this object that will represent
5817 * the multi-bulk length in the output buffer, and will "fix"
5818 * it later */
5819 if (!justcount) {
5820 lenobj = createObject(REDIS_STRING,NULL);
5821 addReply(c,lenobj);
5822 decrRefCount(lenobj);
5823 }
5824
5825 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5826 if (offset) {
5827 offset--;
5828 ln = ln->forward[0];
5829 continue;
5830 }
5831 if (limit == 0) break;
5832 if (!justcount) {
5833 ele = ln->obj;
5834 addReplyBulkLen(c,ele);
5835 addReply(c,ele);
5836 addReply(c,shared.crlf);
5837 if (withscores)
5838 addReplyDouble(c,ln->score);
5839 }
5840 ln = ln->forward[0];
5841 rangelen++;
5842 if (limit > 0) limit--;
5843 }
5844 if (justcount) {
5845 addReplyLong(c,(long)rangelen);
5846 } else {
5847 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5848 withscores ? (rangelen*2) : rangelen);
5849 }
5850 }
5851 }
5852 }
5853
5854 static void zrangebyscoreCommand(redisClient *c) {
5855 genericZrangebyscoreCommand(c,0);
5856 }
5857
5858 static void zcountCommand(redisClient *c) {
5859 genericZrangebyscoreCommand(c,1);
5860 }
5861
5862 static void zcardCommand(redisClient *c) {
5863 robj *o;
5864 zset *zs;
5865
5866 o = lookupKeyRead(c->db,c->argv[1]);
5867 if (o == NULL) {
5868 addReply(c,shared.czero);
5869 return;
5870 } else {
5871 if (o->type != REDIS_ZSET) {
5872 addReply(c,shared.wrongtypeerr);
5873 } else {
5874 zs = o->ptr;
5875 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
5876 }
5877 }
5878 }
5879
5880 static void zscoreCommand(redisClient *c) {
5881 robj *o;
5882 zset *zs;
5883
5884 o = lookupKeyRead(c->db,c->argv[1]);
5885 if (o == NULL) {
5886 addReply(c,shared.nullbulk);
5887 return;
5888 } else {
5889 if (o->type != REDIS_ZSET) {
5890 addReply(c,shared.wrongtypeerr);
5891 } else {
5892 dictEntry *de;
5893
5894 zs = o->ptr;
5895 de = dictFind(zs->dict,c->argv[2]);
5896 if (!de) {
5897 addReply(c,shared.nullbulk);
5898 } else {
5899 double *score = dictGetEntryVal(de);
5900
5901 addReplyDouble(c,*score);
5902 }
5903 }
5904 }
5905 }
5906
5907 static void zrankGenericCommand(redisClient *c, int reverse) {
5908 robj *o;
5909 o = lookupKeyRead(c->db,c->argv[1]);
5910 if (o == NULL) {
5911 addReply(c,shared.nullbulk);
5912 return;
5913 }
5914 if (o->type != REDIS_ZSET) {
5915 addReply(c,shared.wrongtypeerr);
5916 } else {
5917 zset *zs = o->ptr;
5918 zskiplist *zsl = zs->zsl;
5919 dictEntry *de;
5920 unsigned long rank;
5921
5922 de = dictFind(zs->dict,c->argv[2]);
5923 if (!de) {
5924 addReply(c,shared.nullbulk);
5925 return;
5926 }
5927
5928 double *score = dictGetEntryVal(de);
5929 rank = zslGetRank(zsl, *score, c->argv[2]);
5930 if (rank) {
5931 if (reverse) {
5932 addReplyLong(c, zsl->length - rank);
5933 } else {
5934 addReplyLong(c, rank-1);
5935 }
5936 } else {
5937 addReply(c,shared.nullbulk);
5938 }
5939 }
5940 }
5941
5942 static void zrankCommand(redisClient *c) {
5943 zrankGenericCommand(c, 0);
5944 }
5945
5946 static void zrevrankCommand(redisClient *c) {
5947 zrankGenericCommand(c, 1);
5948 }
5949
5950 /* =================================== Hashes =============================== */
5951 static void hsetCommand(redisClient *c) {
5952 int update = 0;
5953 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5954
5955 if (o == NULL) {
5956 o = createHashObject();
5957 dictAdd(c->db->dict,c->argv[1],o);
5958 incrRefCount(c->argv[1]);
5959 } else {
5960 if (o->type != REDIS_HASH) {
5961 addReply(c,shared.wrongtypeerr);
5962 return;
5963 }
5964 }
5965 /* We want to convert the zipmap into an hash table right now if the
5966 * entry to be added is too big. Note that we check if the object
5967 * is integer encoded before to try fetching the length in the test below.
5968 * This is because integers are small, but currently stringObjectLen()
5969 * performs a slow conversion: not worth it. */
5970 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5971 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5972 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5973 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5974 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5975 {
5976 convertToRealHash(o);
5977 }
5978
5979 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5980 unsigned char *zm = o->ptr;
5981 robj *valobj = getDecodedObject(c->argv[3]);
5982
5983 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5984 valobj->ptr,sdslen(valobj->ptr),&update);
5985 decrRefCount(valobj);
5986 o->ptr = zm;
5987
5988 /* And here there is the second check for hash conversion...
5989 * we want to do it only if the operation was not just an update as
5990 * zipmapLen() is O(N). */
5991 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5992 convertToRealHash(o);
5993 } else {
5994 tryObjectEncoding(c->argv[2]);
5995 /* note that c->argv[3] is already encoded, as the latest arg
5996 * of a bulk command is always integer encoded if possible. */
5997 if (dictAdd(o->ptr,c->argv[2],c->argv[3]) == DICT_OK) {
5998 incrRefCount(c->argv[2]);
5999 } else {
6000 update = 1;
6001 }
6002 incrRefCount(c->argv[3]);
6003 }
6004 server.dirty++;
6005 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6006 }
6007
6008 static void hgetCommand(redisClient *c) {
6009 robj *o = lookupKeyRead(c->db,c->argv[1]);
6010
6011 if (o == NULL) {
6012 addReply(c,shared.nullbulk);
6013 return;
6014 } else {
6015 if (o->type != REDIS_HASH) {
6016 addReply(c,shared.wrongtypeerr);
6017 return;
6018 }
6019
6020 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6021 unsigned char *zm = o->ptr;
6022 unsigned char *val;
6023 unsigned int vlen;
6024
6025 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr), &val,&vlen)) {
6026 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6027 addReplySds(c,sdsnewlen(val,vlen));
6028 addReply(c,shared.crlf);
6029 return;
6030 } else {
6031 addReply(c,shared.nullbulk);
6032 return;
6033 }
6034 } else {
6035 struct dictEntry *de;
6036
6037 de = dictFind(o->ptr,c->argv[2]);
6038 if (de == NULL) {
6039 addReply(c,shared.nullbulk);
6040 } else {
6041 robj *e = dictGetEntryVal(de);
6042
6043 addReplyBulkLen(c,e);
6044 addReply(c,e);
6045 addReply(c,shared.crlf);
6046 }
6047 }
6048 }
6049 }
6050
6051 static void hdelCommand(redisClient *c) {
6052 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6053
6054 if (o == NULL) {
6055 addReply(c,shared.czero);
6056 return;
6057 } else {
6058 int deleted = 0;
6059
6060 if (o->type != REDIS_HASH) {
6061 addReply(c,shared.wrongtypeerr);
6062 return;
6063 }
6064
6065 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6066 o->ptr = zipmapDel((unsigned char*) o->ptr,
6067 (unsigned char*) c->argv[2]->ptr,
6068 sdslen(c->argv[2]->ptr), &deleted);
6069 } else {
6070 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6071 }
6072 addReply(c,deleted ? shared.cone : shared.czero);
6073 }
6074 }
6075
6076 static void hlenCommand(redisClient *c) {
6077 robj *o;
6078 unsigned long len;
6079
6080 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6081 checkType(c,o,REDIS_HASH)) return;
6082
6083 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6084 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6085 addReplyUlong(c,len);
6086 }
6087
6088 static void convertToRealHash(robj *o) {
6089 unsigned char *key, *val, *p, *zm = o->ptr;
6090 unsigned int klen, vlen;
6091 dict *dict = dictCreate(&hashDictType,NULL);
6092
6093 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6094 p = zipmapRewind(zm);
6095 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6096 robj *keyobj, *valobj;
6097
6098 keyobj = createStringObject((char*)key,klen);
6099 valobj = createStringObject((char*)val,vlen);
6100 tryObjectEncoding(keyobj);
6101 tryObjectEncoding(valobj);
6102 dictAdd(dict,keyobj,valobj);
6103 }
6104 o->encoding = REDIS_ENCODING_HT;
6105 o->ptr = dict;
6106 zfree(zm);
6107 }
6108
6109 /* ========================= Non type-specific commands ==================== */
6110
6111 static void flushdbCommand(redisClient *c) {
6112 server.dirty += dictSize(c->db->dict);
6113 dictEmpty(c->db->dict);
6114 dictEmpty(c->db->expires);
6115 addReply(c,shared.ok);
6116 }
6117
6118 static void flushallCommand(redisClient *c) {
6119 server.dirty += emptyDb();
6120 addReply(c,shared.ok);
6121 rdbSave(server.dbfilename);
6122 server.dirty++;
6123 }
6124
6125 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6126 redisSortOperation *so = zmalloc(sizeof(*so));
6127 so->type = type;
6128 so->pattern = pattern;
6129 return so;
6130 }
6131
6132 /* Return the value associated to the key with a name obtained
6133 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6134 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6135 char *p;
6136 sds spat, ssub;
6137 robj keyobj;
6138 int prefixlen, sublen, postfixlen;
6139 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6140 struct {
6141 long len;
6142 long free;
6143 char buf[REDIS_SORTKEY_MAX+1];
6144 } keyname;
6145
6146 /* If the pattern is "#" return the substitution object itself in order
6147 * to implement the "SORT ... GET #" feature. */
6148 spat = pattern->ptr;
6149 if (spat[0] == '#' && spat[1] == '\0') {
6150 return subst;
6151 }
6152
6153 /* The substitution object may be specially encoded. If so we create
6154 * a decoded object on the fly. Otherwise getDecodedObject will just
6155 * increment the ref count, that we'll decrement later. */
6156 subst = getDecodedObject(subst);
6157
6158 ssub = subst->ptr;
6159 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6160 p = strchr(spat,'*');
6161 if (!p) {
6162 decrRefCount(subst);
6163 return NULL;
6164 }
6165
6166 prefixlen = p-spat;
6167 sublen = sdslen(ssub);
6168 postfixlen = sdslen(spat)-(prefixlen+1);
6169 memcpy(keyname.buf,spat,prefixlen);
6170 memcpy(keyname.buf+prefixlen,ssub,sublen);
6171 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6172 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6173 keyname.len = prefixlen+sublen+postfixlen;
6174
6175 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6176 decrRefCount(subst);
6177
6178 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6179 return lookupKeyRead(db,&keyobj);
6180 }
6181
6182 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6183 * the additional parameter is not standard but a BSD-specific we have to
6184 * pass sorting parameters via the global 'server' structure */
6185 static int sortCompare(const void *s1, const void *s2) {
6186 const redisSortObject *so1 = s1, *so2 = s2;
6187 int cmp;
6188
6189 if (!server.sort_alpha) {
6190 /* Numeric sorting. Here it's trivial as we precomputed scores */
6191 if (so1->u.score > so2->u.score) {
6192 cmp = 1;
6193 } else if (so1->u.score < so2->u.score) {
6194 cmp = -1;
6195 } else {
6196 cmp = 0;
6197 }
6198 } else {
6199 /* Alphanumeric sorting */
6200 if (server.sort_bypattern) {
6201 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6202 /* At least one compare object is NULL */
6203 if (so1->u.cmpobj == so2->u.cmpobj)
6204 cmp = 0;
6205 else if (so1->u.cmpobj == NULL)
6206 cmp = -1;
6207 else
6208 cmp = 1;
6209 } else {
6210 /* We have both the objects, use strcoll */
6211 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6212 }
6213 } else {
6214 /* Compare elements directly */
6215 robj *dec1, *dec2;
6216
6217 dec1 = getDecodedObject(so1->obj);
6218 dec2 = getDecodedObject(so2->obj);
6219 cmp = strcoll(dec1->ptr,dec2->ptr);
6220 decrRefCount(dec1);
6221 decrRefCount(dec2);
6222 }
6223 }
6224 return server.sort_desc ? -cmp : cmp;
6225 }
6226
6227 /* The SORT command is the most complex command in Redis. Warning: this code
6228 * is optimized for speed and a bit less for readability */
6229 static void sortCommand(redisClient *c) {
6230 list *operations;
6231 int outputlen = 0;
6232 int desc = 0, alpha = 0;
6233 int limit_start = 0, limit_count = -1, start, end;
6234 int j, dontsort = 0, vectorlen;
6235 int getop = 0; /* GET operation counter */
6236 robj *sortval, *sortby = NULL, *storekey = NULL;
6237 redisSortObject *vector; /* Resulting vector to sort */
6238
6239 /* Lookup the key to sort. It must be of the right types */
6240 sortval = lookupKeyRead(c->db,c->argv[1]);
6241 if (sortval == NULL) {
6242 addReply(c,shared.nullmultibulk);
6243 return;
6244 }
6245 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6246 sortval->type != REDIS_ZSET)
6247 {
6248 addReply(c,shared.wrongtypeerr);
6249 return;
6250 }
6251
6252 /* Create a list of operations to perform for every sorted element.
6253 * Operations can be GET/DEL/INCR/DECR */
6254 operations = listCreate();
6255 listSetFreeMethod(operations,zfree);
6256 j = 2;
6257
6258 /* Now we need to protect sortval incrementing its count, in the future
6259 * SORT may have options able to overwrite/delete keys during the sorting
6260 * and the sorted key itself may get destroied */
6261 incrRefCount(sortval);
6262
6263 /* The SORT command has an SQL-alike syntax, parse it */
6264 while(j < c->argc) {
6265 int leftargs = c->argc-j-1;
6266 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6267 desc = 0;
6268 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6269 desc = 1;
6270 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6271 alpha = 1;
6272 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6273 limit_start = atoi(c->argv[j+1]->ptr);
6274 limit_count = atoi(c->argv[j+2]->ptr);
6275 j+=2;
6276 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6277 storekey = c->argv[j+1];
6278 j++;
6279 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6280 sortby = c->argv[j+1];
6281 /* If the BY pattern does not contain '*', i.e. it is constant,
6282 * we don't need to sort nor to lookup the weight keys. */
6283 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6284 j++;
6285 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6286 listAddNodeTail(operations,createSortOperation(
6287 REDIS_SORT_GET,c->argv[j+1]));
6288 getop++;
6289 j++;
6290 } else {
6291 decrRefCount(sortval);
6292 listRelease(operations);
6293 addReply(c,shared.syntaxerr);
6294 return;
6295 }
6296 j++;
6297 }
6298
6299 /* Load the sorting vector with all the objects to sort */
6300 switch(sortval->type) {
6301 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6302 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6303 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6304 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6305 }
6306 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6307 j = 0;
6308
6309 if (sortval->type == REDIS_LIST) {
6310 list *list = sortval->ptr;
6311 listNode *ln;
6312 listIter li;
6313
6314 listRewind(list,&li);
6315 while((ln = listNext(&li))) {
6316 robj *ele = ln->value;
6317 vector[j].obj = ele;
6318 vector[j].u.score = 0;
6319 vector[j].u.cmpobj = NULL;
6320 j++;
6321 }
6322 } else {
6323 dict *set;
6324 dictIterator *di;
6325 dictEntry *setele;
6326
6327 if (sortval->type == REDIS_SET) {
6328 set = sortval->ptr;
6329 } else {
6330 zset *zs = sortval->ptr;
6331 set = zs->dict;
6332 }
6333
6334 di = dictGetIterator(set);
6335 while((setele = dictNext(di)) != NULL) {
6336 vector[j].obj = dictGetEntryKey(setele);
6337 vector[j].u.score = 0;
6338 vector[j].u.cmpobj = NULL;
6339 j++;
6340 }
6341 dictReleaseIterator(di);
6342 }
6343 redisAssert(j == vectorlen);
6344
6345 /* Now it's time to load the right scores in the sorting vector */
6346 if (dontsort == 0) {
6347 for (j = 0; j < vectorlen; j++) {
6348 if (sortby) {
6349 robj *byval;
6350
6351 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6352 if (!byval || byval->type != REDIS_STRING) continue;
6353 if (alpha) {
6354 vector[j].u.cmpobj = getDecodedObject(byval);
6355 } else {
6356 if (byval->encoding == REDIS_ENCODING_RAW) {
6357 vector[j].u.score = strtod(byval->ptr,NULL);
6358 } else {
6359 /* Don't need to decode the object if it's
6360 * integer-encoded (the only encoding supported) so
6361 * far. We can just cast it */
6362 if (byval->encoding == REDIS_ENCODING_INT) {
6363 vector[j].u.score = (long)byval->ptr;
6364 } else
6365 redisAssert(1 != 1);
6366 }
6367 }
6368 } else {
6369 if (!alpha) {
6370 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6371 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6372 else {
6373 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6374 vector[j].u.score = (long) vector[j].obj->ptr;
6375 else
6376 redisAssert(1 != 1);
6377 }
6378 }
6379 }
6380 }
6381 }
6382
6383 /* We are ready to sort the vector... perform a bit of sanity check
6384 * on the LIMIT option too. We'll use a partial version of quicksort. */
6385 start = (limit_start < 0) ? 0 : limit_start;
6386 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6387 if (start >= vectorlen) {
6388 start = vectorlen-1;
6389 end = vectorlen-2;
6390 }
6391 if (end >= vectorlen) end = vectorlen-1;
6392
6393 if (dontsort == 0) {
6394 server.sort_desc = desc;
6395 server.sort_alpha = alpha;
6396 server.sort_bypattern = sortby ? 1 : 0;
6397 if (sortby && (start != 0 || end != vectorlen-1))
6398 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6399 else
6400 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6401 }
6402
6403 /* Send command output to the output buffer, performing the specified
6404 * GET/DEL/INCR/DECR operations if any. */
6405 outputlen = getop ? getop*(end-start+1) : end-start+1;
6406 if (storekey == NULL) {
6407 /* STORE option not specified, sent the sorting result to client */
6408 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6409 for (j = start; j <= end; j++) {
6410 listNode *ln;
6411 listIter li;
6412
6413 if (!getop) {
6414 addReplyBulkLen(c,vector[j].obj);
6415 addReply(c,vector[j].obj);
6416 addReply(c,shared.crlf);
6417 }
6418 listRewind(operations,&li);
6419 while((ln = listNext(&li))) {
6420 redisSortOperation *sop = ln->value;
6421 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6422 vector[j].obj);
6423
6424 if (sop->type == REDIS_SORT_GET) {
6425 if (!val || val->type != REDIS_STRING) {
6426 addReply(c,shared.nullbulk);
6427 } else {
6428 addReplyBulkLen(c,val);
6429 addReply(c,val);
6430 addReply(c,shared.crlf);
6431 }
6432 } else {
6433 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6434 }
6435 }
6436 }
6437 } else {
6438 robj *listObject = createListObject();
6439 list *listPtr = (list*) listObject->ptr;
6440
6441 /* STORE option specified, set the sorting result as a List object */
6442 for (j = start; j <= end; j++) {
6443 listNode *ln;
6444 listIter li;
6445
6446 if (!getop) {
6447 listAddNodeTail(listPtr,vector[j].obj);
6448 incrRefCount(vector[j].obj);
6449 }
6450 listRewind(operations,&li);
6451 while((ln = listNext(&li))) {
6452 redisSortOperation *sop = ln->value;
6453 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6454 vector[j].obj);
6455
6456 if (sop->type == REDIS_SORT_GET) {
6457 if (!val || val->type != REDIS_STRING) {
6458 listAddNodeTail(listPtr,createStringObject("",0));
6459 } else {
6460 listAddNodeTail(listPtr,val);
6461 incrRefCount(val);
6462 }
6463 } else {
6464 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6465 }
6466 }
6467 }
6468 if (dictReplace(c->db->dict,storekey,listObject)) {
6469 incrRefCount(storekey);
6470 }
6471 /* Note: we add 1 because the DB is dirty anyway since even if the
6472 * SORT result is empty a new key is set and maybe the old content
6473 * replaced. */
6474 server.dirty += 1+outputlen;
6475 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6476 }
6477
6478 /* Cleanup */
6479 decrRefCount(sortval);
6480 listRelease(operations);
6481 for (j = 0; j < vectorlen; j++) {
6482 if (sortby && alpha && vector[j].u.cmpobj)
6483 decrRefCount(vector[j].u.cmpobj);
6484 }
6485 zfree(vector);
6486 }
6487
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T and so forth.
 *
 * 's' must point to a buffer large enough for the result (the caller in
 * this file uses 64 bytes; the longest possible output is far shorter).
 * Values of one terabyte or more are expressed in terabytes, so the buffer
 * is always written regardless of the magnitude of 'n'. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Anything bigger: without this branch the buffer was left
         * untouched and the caller printed uninitialized memory. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6508
6509 /* Create the string returned by the INFO command. This is decoupled
6510 * by the INFO command itself as we need to report the same information
6511 * on memory corruption problems. */
6512 static sds genRedisInfoString(void) {
6513 sds info;
6514 time_t uptime = time(NULL)-server.stat_starttime;
6515 int j;
6516 char hmem[64];
6517
6518 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
6519 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
6520
6521 bytesToHuman(hmem,zmalloc_used_memory());
6522 info = sdscatprintf(sdsempty(),
6523 "redis_version:%s\r\n"
6524 "arch_bits:%s\r\n"
6525 "multiplexing_api:%s\r\n"
6526 "process_id:%ld\r\n"
6527 "uptime_in_seconds:%ld\r\n"
6528 "uptime_in_days:%ld\r\n"
6529 "connected_clients:%d\r\n"
6530 "connected_slaves:%d\r\n"
6531 "blocked_clients:%d\r\n"
6532 "used_memory:%zu\r\n"
6533 "used_memory_human:%s\r\n"
6534 "changes_since_last_save:%lld\r\n"
6535 "bgsave_in_progress:%d\r\n"
6536 "last_save_time:%ld\r\n"
6537 "bgrewriteaof_in_progress:%d\r\n"
6538 "total_connections_received:%lld\r\n"
6539 "total_commands_processed:%lld\r\n"
6540 "hash_max_zipmap_entries:%ld\r\n"
6541 "hash_max_zipmap_value:%ld\r\n"
6542 "vm_enabled:%d\r\n"
6543 "role:%s\r\n"
6544 ,REDIS_VERSION,
6545 (sizeof(long) == 8) ? "64" : "32",
6546 aeGetApiName(),
6547 (long) getpid(),
6548 uptime,
6549 uptime/(3600*24),
6550 listLength(server.clients)-listLength(server.slaves),
6551 listLength(server.slaves),
6552 server.blpop_blocked_clients,
6553 zmalloc_used_memory(),
6554 hmem,
6555 server.dirty,
6556 server.bgsavechildpid != -1,
6557 server.lastsave,
6558 server.bgrewritechildpid != -1,
6559 server.stat_numconnections,
6560 server.stat_numcommands,
6561 server.hash_max_zipmap_entries,
6562 server.hash_max_zipmap_value,
6563 server.vm_enabled != 0,
6564 server.masterhost == NULL ? "master" : "slave"
6565 );
6566 if (server.masterhost) {
6567 info = sdscatprintf(info,
6568 "master_host:%s\r\n"
6569 "master_port:%d\r\n"
6570 "master_link_status:%s\r\n"
6571 "master_last_io_seconds_ago:%d\r\n"
6572 ,server.masterhost,
6573 server.masterport,
6574 (server.replstate == REDIS_REPL_CONNECTED) ?
6575 "up" : "down",
6576 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6577 );
6578 }
6579 if (server.vm_enabled) {
6580 lockThreadedIO();
6581 info = sdscatprintf(info,
6582 "vm_conf_max_memory:%llu\r\n"
6583 "vm_conf_page_size:%llu\r\n"
6584 "vm_conf_pages:%llu\r\n"
6585 "vm_stats_used_pages:%llu\r\n"
6586 "vm_stats_swapped_objects:%llu\r\n"
6587 "vm_stats_swappin_count:%llu\r\n"
6588 "vm_stats_swappout_count:%llu\r\n"
6589 "vm_stats_io_newjobs_len:%lu\r\n"
6590 "vm_stats_io_processing_len:%lu\r\n"
6591 "vm_stats_io_processed_len:%lu\r\n"
6592 "vm_stats_io_active_threads:%lu\r\n"
6593 "vm_stats_blocked_clients:%lu\r\n"
6594 ,(unsigned long long) server.vm_max_memory,
6595 (unsigned long long) server.vm_page_size,
6596 (unsigned long long) server.vm_pages,
6597 (unsigned long long) server.vm_stats_used_pages,
6598 (unsigned long long) server.vm_stats_swapped_objects,
6599 (unsigned long long) server.vm_stats_swapins,
6600 (unsigned long long) server.vm_stats_swapouts,
6601 (unsigned long) listLength(server.io_newjobs),
6602 (unsigned long) listLength(server.io_processing),
6603 (unsigned long) listLength(server.io_processed),
6604 (unsigned long) server.io_active_threads,
6605 (unsigned long) server.vm_blocked_clients
6606 );
6607 unlockThreadedIO();
6608 }
6609 for (j = 0; j < server.dbnum; j++) {
6610 long long keys, vkeys;
6611
6612 keys = dictSize(server.db[j].dict);
6613 vkeys = dictSize(server.db[j].expires);
6614 if (keys || vkeys) {
6615 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6616 j, keys, vkeys);
6617 }
6618 }
6619 return info;
6620 }
6621
6622 static void infoCommand(redisClient *c) {
6623 sds info = genRedisInfoString();
6624 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6625 (unsigned long)sdslen(info)));
6626 addReplySds(c,info);
6627 addReply(c,shared.crlf);
6628 }
6629
6630 static void monitorCommand(redisClient *c) {
6631 /* ignore MONITOR if aleady slave or in monitor mode */
6632 if (c->flags & REDIS_SLAVE) return;
6633
6634 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6635 c->slaveseldb = 0;
6636 listAddNodeTail(server.monitors,c);
6637 addReply(c,shared.ok);
6638 }
6639
6640 /* ================================= Expire ================================= */
6641 static int removeExpire(redisDb *db, robj *key) {
6642 if (dictDelete(db->expires,key) == DICT_OK) {
6643 return 1;
6644 } else {
6645 return 0;
6646 }
6647 }
6648
6649 static int setExpire(redisDb *db, robj *key, time_t when) {
6650 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6651 return 0;
6652 } else {
6653 incrRefCount(key);
6654 return 1;
6655 }
6656 }
6657
6658 /* Return the expire time of the specified key, or -1 if no expire
6659 * is associated with this key (i.e. the key is non volatile) */
6660 static time_t getExpire(redisDb *db, robj *key) {
6661 dictEntry *de;
6662
6663 /* No expire? return ASAP */
6664 if (dictSize(db->expires) == 0 ||
6665 (de = dictFind(db->expires,key)) == NULL) return -1;
6666
6667 return (time_t) dictGetEntryVal(de);
6668 }
6669
6670 static int expireIfNeeded(redisDb *db, robj *key) {
6671 time_t when;
6672 dictEntry *de;
6673
6674 /* No expire? return ASAP */
6675 if (dictSize(db->expires) == 0 ||
6676 (de = dictFind(db->expires,key)) == NULL) return 0;
6677
6678 /* Lookup the expire */
6679 when = (time_t) dictGetEntryVal(de);
6680 if (time(NULL) <= when) return 0;
6681
6682 /* Delete the key */
6683 dictDelete(db->expires,key);
6684 return dictDelete(db->dict,key) == DICT_OK;
6685 }
6686
6687 static int deleteIfVolatile(redisDb *db, robj *key) {
6688 dictEntry *de;
6689
6690 /* No expire? return ASAP */
6691 if (dictSize(db->expires) == 0 ||
6692 (de = dictFind(db->expires,key)) == NULL) return 0;
6693
6694 /* Delete the key */
6695 server.dirty++;
6696 dictDelete(db->expires,key);
6697 return dictDelete(db->dict,key) == DICT_OK;
6698 }
6699
6700 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6701 dictEntry *de;
6702
6703 de = dictFind(c->db->dict,key);
6704 if (de == NULL) {
6705 addReply(c,shared.czero);
6706 return;
6707 }
6708 if (seconds < 0) {
6709 if (deleteKey(c->db,key)) server.dirty++;
6710 addReply(c, shared.cone);
6711 return;
6712 } else {
6713 time_t when = time(NULL)+seconds;
6714 if (setExpire(c->db,key,when)) {
6715 addReply(c,shared.cone);
6716 server.dirty++;
6717 } else {
6718 addReply(c,shared.czero);
6719 }
6720 return;
6721 }
6722 }
6723
6724 static void expireCommand(redisClient *c) {
6725 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6726 }
6727
6728 static void expireatCommand(redisClient *c) {
6729 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6730 }
6731
6732 static void ttlCommand(redisClient *c) {
6733 time_t expire;
6734 int ttl = -1;
6735
6736 expire = getExpire(c->db,c->argv[1]);
6737 if (expire != -1) {
6738 ttl = (int) (expire-time(NULL));
6739 if (ttl < 0) ttl = -1;
6740 }
6741 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6742 }
6743
6744 /* ================================ MULTI/EXEC ============================== */
6745
6746 /* Client state initialization for MULTI/EXEC */
6747 static void initClientMultiState(redisClient *c) {
6748 c->mstate.commands = NULL;
6749 c->mstate.count = 0;
6750 }
6751
6752 /* Release all the resources associated with MULTI/EXEC state */
6753 static void freeClientMultiState(redisClient *c) {
6754 int j;
6755
6756 for (j = 0; j < c->mstate.count; j++) {
6757 int i;
6758 multiCmd *mc = c->mstate.commands+j;
6759
6760 for (i = 0; i < mc->argc; i++)
6761 decrRefCount(mc->argv[i]);
6762 zfree(mc->argv);
6763 }
6764 zfree(c->mstate.commands);
6765 }
6766
6767 /* Add a new command into the MULTI commands queue */
6768 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6769 multiCmd *mc;
6770 int j;
6771
6772 c->mstate.commands = zrealloc(c->mstate.commands,
6773 sizeof(multiCmd)*(c->mstate.count+1));
6774 mc = c->mstate.commands+c->mstate.count;
6775 mc->cmd = cmd;
6776 mc->argc = c->argc;
6777 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6778 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6779 for (j = 0; j < c->argc; j++)
6780 incrRefCount(mc->argv[j]);
6781 c->mstate.count++;
6782 }
6783
6784 static void multiCommand(redisClient *c) {
6785 c->flags |= REDIS_MULTI;
6786 addReply(c,shared.ok);
6787 }
6788
6789 static void discardCommand(redisClient *c) {
6790 if (!(c->flags & REDIS_MULTI)) {
6791 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6792 return;
6793 }
6794
6795 freeClientMultiState(c);
6796 initClientMultiState(c);
6797 c->flags &= (~REDIS_MULTI);
6798 addReply(c,shared.ok);
6799 }
6800
6801 static void execCommand(redisClient *c) {
6802 int j;
6803 robj **orig_argv;
6804 int orig_argc;
6805
6806 if (!(c->flags & REDIS_MULTI)) {
6807 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6808 return;
6809 }
6810
6811 orig_argv = c->argv;
6812 orig_argc = c->argc;
6813 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6814 for (j = 0; j < c->mstate.count; j++) {
6815 c->argc = c->mstate.commands[j].argc;
6816 c->argv = c->mstate.commands[j].argv;
6817 call(c,c->mstate.commands[j].cmd);
6818 }
6819 c->argv = orig_argv;
6820 c->argc = orig_argc;
6821 freeClientMultiState(c);
6822 initClientMultiState(c);
6823 c->flags &= (~REDIS_MULTI);
6824 }
6825
6826 /* =========================== Blocking Operations ========================= */
6827
6828 /* Currently Redis blocking operations support is limited to list POP ops,
6829 * so the current implementation is not fully generic, but it is also not
6830 * completely specific so it will not require a rewrite to support new
6831 * kind of blocking operations in the future.
6832 *
6833 * Still it's important to note that list blocking operations can be already
6834 * used as a notification mechanism in order to implement other blocking
6835 * operations at application level, so there must be a very strong evidence
6836 * of usefulness and generality before new blocking operations are implemented.
6837 *
6838 * This is how the current blocking POP works, we use BLPOP as example:
6839 * - If the user calls BLPOP and the key exists and contains a non empty list
6840 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6841 * if there is no need to block.
6842 * - If instead BLPOP is called and the key does not exist or the list is
6843 * empty we need to block. In order to do so we remove the notification for
6844 * new data to read in the client socket (so that we'll not serve new
6845 * requests if the blocking request is not served). Also we put the client
6846 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
6847 * blocking for these keys.
6848 * - If a PUSH operation against a key with blocked clients waiting is
6849 * performed, we serve the first in the list: basically instead of pushing
6850 * the new element inside the list we return it to the (first / oldest)
6851 * blocking client, unblock the client, and remove it from the list.
6852 *
6853 * The above comment and the source code should be enough in order to understand
6854 * the implementation and modify / fix it later.
6855 */
6856
6857 /* Set a client in blocking mode for the specified key, with the specified
6858 * timeout */
6859 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6860 dictEntry *de;
6861 list *l;
6862 int j;
6863
6864 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6865 c->blockingkeysnum = numkeys;
6866 c->blockingto = timeout;
6867 for (j = 0; j < numkeys; j++) {
6868 /* Add the key in the client structure, to map clients -> keys */
6869 c->blockingkeys[j] = keys[j];
6870 incrRefCount(keys[j]);
6871
6872 /* And in the other "side", to map keys -> clients */
6873 de = dictFind(c->db->blockingkeys,keys[j]);
6874 if (de == NULL) {
6875 int retval;
6876
6877 /* For every key we take a list of clients blocked for it */
6878 l = listCreate();
6879 retval = dictAdd(c->db->blockingkeys,keys[j],l);
6880 incrRefCount(keys[j]);
6881 assert(retval == DICT_OK);
6882 } else {
6883 l = dictGetEntryVal(de);
6884 }
6885 listAddNodeTail(l,c);
6886 }
6887 /* Mark the client as a blocked client */
6888 c->flags |= REDIS_BLOCKED;
6889 server.blpop_blocked_clients++;
6890 }
6891
6892 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
6893 static void unblockClientWaitingData(redisClient *c) {
6894 dictEntry *de;
6895 list *l;
6896 int j;
6897
6898 assert(c->blockingkeys != NULL);
6899 /* The client may wait for multiple keys, so unblock it for every key. */
6900 for (j = 0; j < c->blockingkeysnum; j++) {
6901 /* Remove this client from the list of clients waiting for this key. */
6902 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6903 assert(de != NULL);
6904 l = dictGetEntryVal(de);
6905 listDelNode(l,listSearchKey(l,c));
6906 /* If the list is empty we need to remove it to avoid wasting memory */
6907 if (listLength(l) == 0)
6908 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6909 decrRefCount(c->blockingkeys[j]);
6910 }
6911 /* Cleanup the client structure */
6912 zfree(c->blockingkeys);
6913 c->blockingkeys = NULL;
6914 c->flags &= (~REDIS_BLOCKED);
6915 server.blpop_blocked_clients--;
6916 /* We want to process data if there is some command waiting
6917 * in the input buffer. Note that this is safe even if
6918 * unblockClientWaitingData() gets called from freeClient() because
6919 * freeClient() will be smart enough to call this function
6920 * *after* c->querybuf was set to NULL. */
6921 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6922 }
6923
6924 /* This should be called from any function PUSHing into lists.
6925 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6926 * 'ele' is the element pushed.
6927 *
6928 * If the function returns 0 there was no client waiting for a list push
6929 * against this key.
6930 *
6931 * If the function returns 1 there was a client waiting for a list push
6932 * against this key, the element was passed to this client thus it's not
6933 * needed to actually add it to the list and the caller should return asap. */
6934 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6935 struct dictEntry *de;
6936 redisClient *receiver;
6937 list *l;
6938 listNode *ln;
6939
6940 de = dictFind(c->db->blockingkeys,key);
6941 if (de == NULL) return 0;
6942 l = dictGetEntryVal(de);
6943 ln = listFirst(l);
6944 assert(ln != NULL);
6945 receiver = ln->value;
6946
6947 addReplySds(receiver,sdsnew("*2\r\n"));
6948 addReplyBulkLen(receiver,key);
6949 addReply(receiver,key);
6950 addReply(receiver,shared.crlf);
6951 addReplyBulkLen(receiver,ele);
6952 addReply(receiver,ele);
6953 addReply(receiver,shared.crlf);
6954 unblockClientWaitingData(receiver);
6955 return 1;
6956 }
6957
6958 /* Blocking RPOP/LPOP */
6959 static void blockingPopGenericCommand(redisClient *c, int where) {
6960 robj *o;
6961 time_t timeout;
6962 int j;
6963
6964 for (j = 1; j < c->argc-1; j++) {
6965 o = lookupKeyWrite(c->db,c->argv[j]);
6966 if (o != NULL) {
6967 if (o->type != REDIS_LIST) {
6968 addReply(c,shared.wrongtypeerr);
6969 return;
6970 } else {
6971 list *list = o->ptr;
6972 if (listLength(list) != 0) {
6973 /* If the list contains elements fall back to the usual
6974 * non-blocking POP operation */
6975 robj *argv[2], **orig_argv;
6976 int orig_argc;
6977
6978 /* We need to alter the command arguments before to call
6979 * popGenericCommand() as the command takes a single key. */
6980 orig_argv = c->argv;
6981 orig_argc = c->argc;
6982 argv[1] = c->argv[j];
6983 c->argv = argv;
6984 c->argc = 2;
6985
6986 /* Also the return value is different, we need to output
6987 * the multi bulk reply header and the key name. The
6988 * "real" command will add the last element (the value)
6989 * for us. If this souds like an hack to you it's just
6990 * because it is... */
6991 addReplySds(c,sdsnew("*2\r\n"));
6992 addReplyBulkLen(c,argv[1]);
6993 addReply(c,argv[1]);
6994 addReply(c,shared.crlf);
6995 popGenericCommand(c,where);
6996
6997 /* Fix the client structure with the original stuff */
6998 c->argv = orig_argv;
6999 c->argc = orig_argc;
7000 return;
7001 }
7002 }
7003 }
7004 }
7005 /* If the list is empty or the key does not exists we must block */
7006 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7007 if (timeout > 0) timeout += time(NULL);
7008 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7009 }
7010
7011 static void blpopCommand(redisClient *c) {
7012 blockingPopGenericCommand(c,REDIS_HEAD);
7013 }
7014
7015 static void brpopCommand(redisClient *c) {
7016 blockingPopGenericCommand(c,REDIS_TAIL);
7017 }
7018
7019 /* =============================== Replication ============================= */
7020
7021 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7022 ssize_t nwritten, ret = size;
7023 time_t start = time(NULL);
7024
7025 timeout++;
7026 while(size) {
7027 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7028 nwritten = write(fd,ptr,size);
7029 if (nwritten == -1) return -1;
7030 ptr += nwritten;
7031 size -= nwritten;
7032 }
7033 if ((time(NULL)-start) > timeout) {
7034 errno = ETIMEDOUT;
7035 return -1;
7036 }
7037 }
7038 return ret;
7039 }
7040
7041 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7042 ssize_t nread, totread = 0;
7043 time_t start = time(NULL);
7044
7045 timeout++;
7046 while(size) {
7047 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7048 nread = read(fd,ptr,size);
7049 if (nread == -1) return -1;
7050 ptr += nread;
7051 size -= nread;
7052 totread += nread;
7053 }
7054 if ((time(NULL)-start) > timeout) {
7055 errno = ETIMEDOUT;
7056 return -1;
7057 }
7058 }
7059 return totread;
7060 }
7061
/* Read a line from 'fd' one byte at a time (via syncRead(), each byte with
 * the given 'timeout') into the buffer 'ptr' of 'size' bytes.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * before it is replaced by the terminator. At most size-1 bytes are stored
 * and the buffer is always nul terminated, so a line longer than the buffer
 * is truncated instead of overflowing it.
 *
 * Returns the number of bytes stored before the '\n' (a stripped '\r' is
 * still counted), or -1 if syncRead() fails. Returns 0 without touching the
 * buffer when 'size' is less than 1. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size < 1) return 0;
    *ptr = '\0';
    size--; /* reserve room for the nul terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--; /* one less free byte in the caller's buffer */
    }
    return nread;
}
7082
7083 static void syncCommand(redisClient *c) {
7084 /* ignore SYNC if aleady slave or in monitor mode */
7085 if (c->flags & REDIS_SLAVE) return;
7086
7087 /* SYNC can't be issued when the server has pending data to send to
7088 * the client about already issued commands. We need a fresh reply
7089 * buffer registering the differences between the BGSAVE and the current
7090 * dataset, so that we can copy to other slaves if needed. */
7091 if (listLength(c->reply) != 0) {
7092 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7093 return;
7094 }
7095
7096 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7097 /* Here we need to check if there is a background saving operation
7098 * in progress, or if it is required to start one */
7099 if (server.bgsavechildpid != -1) {
7100 /* Ok a background save is in progress. Let's check if it is a good
7101 * one for replication, i.e. if there is another slave that is
7102 * registering differences since the server forked to save */
7103 redisClient *slave;
7104 listNode *ln;
7105 listIter li;
7106
7107 listRewind(server.slaves,&li);
7108 while((ln = listNext(&li))) {
7109 slave = ln->value;
7110 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7111 }
7112 if (ln) {
7113 /* Perfect, the server is already registering differences for
7114 * another slave. Set the right state, and copy the buffer. */
7115 listRelease(c->reply);
7116 c->reply = listDup(slave->reply);
7117 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7118 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7119 } else {
7120 /* No way, we need to wait for the next BGSAVE in order to
7121 * register differences */
7122 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7123 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7124 }
7125 } else {
7126 /* Ok we don't have a BGSAVE in progress, let's start one */
7127 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7128 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7129 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7130 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7131 return;
7132 }
7133 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7134 }
7135 c->repldbfd = -1;
7136 c->flags |= REDIS_SLAVE;
7137 c->slaveseldb = 0;
7138 listAddNodeTail(server.slaves,c);
7139 return;
7140 }
7141
7142 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7143 redisClient *slave = privdata;
7144 REDIS_NOTUSED(el);
7145 REDIS_NOTUSED(mask);
7146 char buf[REDIS_IOBUF_LEN];
7147 ssize_t nwritten, buflen;
7148
7149 if (slave->repldboff == 0) {
7150 /* Write the bulk write count before to transfer the DB. In theory here
7151 * we don't know how much room there is in the output buffer of the
7152 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7153 * operations) will never be smaller than the few bytes we need. */
7154 sds bulkcount;
7155
7156 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7157 slave->repldbsize);
7158 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7159 {
7160 sdsfree(bulkcount);
7161 freeClient(slave);
7162 return;
7163 }
7164 sdsfree(bulkcount);
7165 }
7166 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7167 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7168 if (buflen <= 0) {
7169 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7170 (buflen == 0) ? "premature EOF" : strerror(errno));
7171 freeClient(slave);
7172 return;
7173 }
7174 if ((nwritten = write(fd,buf,buflen)) == -1) {
7175 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7176 strerror(errno));
7177 freeClient(slave);
7178 return;
7179 }
7180 slave->repldboff += nwritten;
7181 if (slave->repldboff == slave->repldbsize) {
7182 close(slave->repldbfd);
7183 slave->repldbfd = -1;
7184 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7185 slave->replstate = REDIS_REPL_ONLINE;
7186 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7187 sendReplyToClient, slave) == AE_ERR) {
7188 freeClient(slave);
7189 return;
7190 }
7191 addReplySds(slave,sdsempty());
7192 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7193 }
7194 }
7195
7196 /* This function is called at the end of every backgrond saving.
7197 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7198 * otherwise REDIS_ERR is passed to the function.
7199 *
7200 * The goal of this function is to handle slaves waiting for a successful
7201 * background saving in order to perform non-blocking synchronization. */
7202 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7203 listNode *ln;
7204 int startbgsave = 0;
7205 listIter li;
7206
7207 listRewind(server.slaves,&li);
7208 while((ln = listNext(&li))) {
7209 redisClient *slave = ln->value;
7210
7211 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7212 startbgsave = 1;
7213 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7214 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7215 struct redis_stat buf;
7216
7217 if (bgsaveerr != REDIS_OK) {
7218 freeClient(slave);
7219 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7220 continue;
7221 }
7222 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7223 redis_fstat(slave->repldbfd,&buf) == -1) {
7224 freeClient(slave);
7225 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7226 continue;
7227 }
7228 slave->repldboff = 0;
7229 slave->repldbsize = buf.st_size;
7230 slave->replstate = REDIS_REPL_SEND_BULK;
7231 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7232 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7233 freeClient(slave);
7234 continue;
7235 }
7236 }
7237 }
7238 if (startbgsave) {
7239 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7240 listIter li;
7241
7242 listRewind(server.slaves,&li);
7243 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7244 while((ln = listNext(&li))) {
7245 redisClient *slave = ln->value;
7246
7247 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7248 freeClient(slave);
7249 }
7250 }
7251 }
7252 }
7253
7254 static int syncWithMaster(void) {
7255 char buf[1024], tmpfile[256], authcmd[1024];
7256 long dumpsize;
7257 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7258 int dfd, maxtries = 5;
7259
7260 if (fd == -1) {
7261 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7262 strerror(errno));
7263 return REDIS_ERR;
7264 }
7265
7266 /* AUTH with the master if required. */
7267 if(server.masterauth) {
7268 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7269 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7270 close(fd);
7271 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7272 strerror(errno));
7273 return REDIS_ERR;
7274 }
7275 /* Read the AUTH result. */
7276 if (syncReadLine(fd,buf,1024,3600) == -1) {
7277 close(fd);
7278 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7279 strerror(errno));
7280 return REDIS_ERR;
7281 }
7282 if (buf[0] != '+') {
7283 close(fd);
7284 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7285 return REDIS_ERR;
7286 }
7287 }
7288
7289 /* Issue the SYNC command */
7290 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7291 close(fd);
7292 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7293 strerror(errno));
7294 return REDIS_ERR;
7295 }
7296 /* Read the bulk write count */
7297 if (syncReadLine(fd,buf,1024,3600) == -1) {
7298 close(fd);
7299 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7300 strerror(errno));
7301 return REDIS_ERR;
7302 }
7303 if (buf[0] != '$') {
7304 close(fd);
7305 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7306 return REDIS_ERR;
7307 }
7308 dumpsize = strtol(buf+1,NULL,10);
7309 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7310 /* Read the bulk write data on a temp file */
7311 while(maxtries--) {
7312 snprintf(tmpfile,256,
7313 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7314 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7315 if (dfd != -1) break;
7316 sleep(1);
7317 }
7318 if (dfd == -1) {
7319 close(fd);
7320 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7321 return REDIS_ERR;
7322 }
7323 while(dumpsize) {
7324 int nread, nwritten;
7325
7326 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7327 if (nread == -1) {
7328 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7329 strerror(errno));
7330 close(fd);
7331 close(dfd);
7332 return REDIS_ERR;
7333 }
7334 nwritten = write(dfd,buf,nread);
7335 if (nwritten == -1) {
7336 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7337 close(fd);
7338 close(dfd);
7339 return REDIS_ERR;
7340 }
7341 dumpsize -= nread;
7342 }
7343 close(dfd);
7344 if (rename(tmpfile,server.dbfilename) == -1) {
7345 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7346 unlink(tmpfile);
7347 close(fd);
7348 return REDIS_ERR;
7349 }
7350 emptyDb();
7351 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7352 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7353 close(fd);
7354 return REDIS_ERR;
7355 }
7356 server.master = createClient(fd);
7357 server.master->flags |= REDIS_MASTER;
7358 server.master->authenticated = 1;
7359 server.replstate = REDIS_REPL_CONNECTED;
7360 return REDIS_OK;
7361 }
7362
7363 static void slaveofCommand(redisClient *c) {
7364 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7365 !strcasecmp(c->argv[2]->ptr,"one")) {
7366 if (server.masterhost) {
7367 sdsfree(server.masterhost);
7368 server.masterhost = NULL;
7369 if (server.master) freeClient(server.master);
7370 server.replstate = REDIS_REPL_NONE;
7371 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7372 }
7373 } else {
7374 sdsfree(server.masterhost);
7375 server.masterhost = sdsdup(c->argv[1]->ptr);
7376 server.masterport = atoi(c->argv[2]->ptr);
7377 if (server.master) freeClient(server.master);
7378 server.replstate = REDIS_REPL_CONNECT;
7379 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7380 server.masterhost, server.masterport);
7381 }
7382 addReply(c,shared.ok);
7383 }
7384
7385 /* ============================ Maxmemory directive ======================== */
7386
7387 /* Try to free one object form the pre-allocated objects free list.
7388 * This is useful under low mem conditions as by default we take 1 million
7389 * free objects allocated. On success REDIS_OK is returned, otherwise
7390 * REDIS_ERR. */
7391 static int tryFreeOneObjectFromFreelist(void) {
7392 robj *o;
7393
7394 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7395 if (listLength(server.objfreelist)) {
7396 listNode *head = listFirst(server.objfreelist);
7397 o = listNodeValue(head);
7398 listDelNode(server.objfreelist,head);
7399 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7400 zfree(o);
7401 return REDIS_OK;
7402 } else {
7403 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7404 return REDIS_ERR;
7405 }
7406 }
7407
7408 /* This function gets called when 'maxmemory' is set on the config file to limit
7409 * the max memory used by the server, and we are out of memory.
7410 * This function will try to, in order:
7411 *
7412 * - Free objects from the free list
7413 * - Try to remove keys with an EXPIRE set
7414 *
7415 * It is not possible to free enough memory to reach used-memory < maxmemory
7416 * the server will start refusing commands that will enlarge even more the
7417 * memory usage.
7418 */
7419 static void freeMemoryIfNeeded(void) {
7420 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7421 int j, k, freed = 0;
7422
7423 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7424 for (j = 0; j < server.dbnum; j++) {
7425 int minttl = -1;
7426 robj *minkey = NULL;
7427 struct dictEntry *de;
7428
7429 if (dictSize(server.db[j].expires)) {
7430 freed = 1;
7431 /* From a sample of three keys drop the one nearest to
7432 * the natural expire */
7433 for (k = 0; k < 3; k++) {
7434 time_t t;
7435
7436 de = dictGetRandomKey(server.db[j].expires);
7437 t = (time_t) dictGetEntryVal(de);
7438 if (minttl == -1 || t < minttl) {
7439 minkey = dictGetEntryKey(de);
7440 minttl = t;
7441 }
7442 }
7443 deleteKey(server.db+j,minkey);
7444 }
7445 }
7446 if (!freed) return; /* nothing to free... */
7447 }
7448 }
7449
7450 /* ============================== Append Only file ========================== */
7451
7452 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7453 sds buf = sdsempty();
7454 int j;
7455 ssize_t nwritten;
7456 time_t now;
7457 robj *tmpargv[3];
7458
7459 /* The DB this command was targetting is not the same as the last command
7460 * we appendend. To issue a SELECT command is needed. */
7461 if (dictid != server.appendseldb) {
7462 char seldb[64];
7463
7464 snprintf(seldb,sizeof(seldb),"%d",dictid);
7465 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7466 (unsigned long)strlen(seldb),seldb);
7467 server.appendseldb = dictid;
7468 }
7469
7470 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7471 * EXPIREs into EXPIREATs calls */
7472 if (cmd->proc == expireCommand) {
7473 long when;
7474
7475 tmpargv[0] = createStringObject("EXPIREAT",8);
7476 tmpargv[1] = argv[1];
7477 incrRefCount(argv[1]);
7478 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7479 tmpargv[2] = createObject(REDIS_STRING,
7480 sdscatprintf(sdsempty(),"%ld",when));
7481 argv = tmpargv;
7482 }
7483
7484 /* Append the actual command */
7485 buf = sdscatprintf(buf,"*%d\r\n",argc);
7486 for (j = 0; j < argc; j++) {
7487 robj *o = argv[j];
7488
7489 o = getDecodedObject(o);
7490 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7491 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7492 buf = sdscatlen(buf,"\r\n",2);
7493 decrRefCount(o);
7494 }
7495
7496 /* Free the objects from the modified argv for EXPIREAT */
7497 if (cmd->proc == expireCommand) {
7498 for (j = 0; j < 3; j++)
7499 decrRefCount(argv[j]);
7500 }
7501
7502 /* We want to perform a single write. This should be guaranteed atomic
7503 * at least if the filesystem we are writing is a real physical one.
7504 * While this will save us against the server being killed I don't think
7505 * there is much to do about the whole server stopping for power problems
7506 * or alike */
7507 nwritten = write(server.appendfd,buf,sdslen(buf));
7508 if (nwritten != (signed)sdslen(buf)) {
7509 /* Ooops, we are in troubles. The best thing to do for now is
7510 * to simply exit instead to give the illusion that everything is
7511 * working as expected. */
7512 if (nwritten == -1) {
7513 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7514 } else {
7515 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7516 }
7517 exit(1);
7518 }
7519 /* If a background append only file rewriting is in progress we want to
7520 * accumulate the differences between the child DB and the current one
7521 * in a buffer, so that when the child process will do its work we
7522 * can append the differences to the new append only file. */
7523 if (server.bgrewritechildpid != -1)
7524 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7525
7526 sdsfree(buf);
7527 now = time(NULL);
7528 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7529 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7530 now-server.lastfsync > 1))
7531 {
7532 fsync(server.appendfd); /* Let's try to get this data on the disk */
7533 server.lastfsync = now;
7534 }
7535 }
7536
7537 /* In Redis commands are always executed in the context of a client, so in
7538 * order to load the append only file we need to create a fake client. */
7539 static struct redisClient *createFakeClient(void) {
7540 struct redisClient *c = zmalloc(sizeof(*c));
7541
7542 selectDb(c,0);
7543 c->fd = -1;
7544 c->querybuf = sdsempty();
7545 c->argc = 0;
7546 c->argv = NULL;
7547 c->flags = 0;
7548 /* We set the fake client as a slave waiting for the synchronization
7549 * so that Redis will not try to send replies to this client. */
7550 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7551 c->reply = listCreate();
7552 listSetFreeMethod(c->reply,decrRefCount);
7553 listSetDupMethod(c->reply,dupClientReplyValue);
7554 return c;
7555 }
7556
7557 static void freeFakeClient(struct redisClient *c) {
7558 sdsfree(c->querybuf);
7559 listRelease(c->reply);
7560 zfree(c);
7561 }
7562
7563 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
7564 * error (the append only file is zero-length) REDIS_ERR is returned. On
7565 * fatal error an error message is logged and the program exists. */
7566 int loadAppendOnlyFile(char *filename) {
7567 struct redisClient *fakeClient;
7568 FILE *fp = fopen(filename,"r");
7569 struct redis_stat sb;
7570 unsigned long long loadedkeys = 0;
7571
7572 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7573 return REDIS_ERR;
7574
7575 if (fp == NULL) {
7576 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7577 exit(1);
7578 }
7579
7580 fakeClient = createFakeClient();
7581 while(1) {
7582 int argc, j;
7583 unsigned long len;
7584 robj **argv;
7585 char buf[128];
7586 sds argsds;
7587 struct redisCommand *cmd;
7588
7589 if (fgets(buf,sizeof(buf),fp) == NULL) {
7590 if (feof(fp))
7591 break;
7592 else
7593 goto readerr;
7594 }
7595 if (buf[0] != '*') goto fmterr;
7596 argc = atoi(buf+1);
7597 argv = zmalloc(sizeof(robj*)*argc);
7598 for (j = 0; j < argc; j++) {
7599 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7600 if (buf[0] != '$') goto fmterr;
7601 len = strtol(buf+1,NULL,10);
7602 argsds = sdsnewlen(NULL,len);
7603 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7604 argv[j] = createObject(REDIS_STRING,argsds);
7605 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7606 }
7607
7608 /* Command lookup */
7609 cmd = lookupCommand(argv[0]->ptr);
7610 if (!cmd) {
7611 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7612 exit(1);
7613 }
7614 /* Try object sharing and encoding */
7615 if (server.shareobjects) {
7616 int j;
7617 for(j = 1; j < argc; j++)
7618 argv[j] = tryObjectSharing(argv[j]);
7619 }
7620 if (cmd->flags & REDIS_CMD_BULK)
7621 tryObjectEncoding(argv[argc-1]);
7622 /* Run the command in the context of a fake client */
7623 fakeClient->argc = argc;
7624 fakeClient->argv = argv;
7625 cmd->proc(fakeClient);
7626 /* Discard the reply objects list from the fake client */
7627 while(listLength(fakeClient->reply))
7628 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7629 /* Clean up, ready for the next command */
7630 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7631 zfree(argv);
7632 /* Handle swapping while loading big datasets when VM is on */
7633 loadedkeys++;
7634 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7635 while (zmalloc_used_memory() > server.vm_max_memory) {
7636 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7637 }
7638 }
7639 }
7640 fclose(fp);
7641 freeFakeClient(fakeClient);
7642 return REDIS_OK;
7643
7644 readerr:
7645 if (feof(fp)) {
7646 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7647 } else {
7648 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7649 }
7650 exit(1);
7651 fmterr:
7652 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7653 exit(1);
7654 }
7655
7656 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7657 static int fwriteBulkObject(FILE *fp, robj *obj) {
7658 char buf[128];
7659 int decrrc = 0;
7660
7661 /* Avoid the incr/decr ref count business if possible to help
7662 * copy-on-write (we are often in a child process when this function
7663 * is called).
7664 * Also makes sure that key objects don't get incrRefCount-ed when VM
7665 * is enabled */
7666 if (obj->encoding != REDIS_ENCODING_RAW) {
7667 obj = getDecodedObject(obj);
7668 decrrc = 1;
7669 }
7670 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7671 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7672 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7673 goto err;
7674 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7675 if (decrrc) decrRefCount(obj);
7676 return 1;
7677 err:
7678 if (decrrc) decrRefCount(obj);
7679 return 0;
7680 }
7681
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 if any fwrite() fails. When 'len' is zero only
 * the header and the trailing CRLF are written and 's' is not accessed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long: use the matching %lu conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7693
/* Write the double 'd' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is 'd' formatted with
 * "%.17g". Returns 1 on success, 0 if any fwrite() fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,plen,1,fp) == 0) return 0;
    return 1;
}
7704
/* Write the long 'l' into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the decimal
 * representation of 'l'. Returns 1 on success, 0 if any fwrite() fails. */
static int fwriteBulkLong(FILE *fp, long l) {
    char digits[128], header[128];
    size_t dlen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the advertised count. */
    snprintf(digits,sizeof(digits),"%ld\r\n",l);
    dlen = strlen(digits);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)dlen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(digits,dlen,1,fp) == 0) return 0;
    return 1;
}
7715
7716 /* Write a sequence of commands able to fully rebuild the dataset into
7717 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7718 static int rewriteAppendOnlyFile(char *filename) {
7719 dictIterator *di = NULL;
7720 dictEntry *de;
7721 FILE *fp;
7722 char tmpfile[256];
7723 int j;
7724 time_t now = time(NULL);
7725
7726 /* Note that we have to use a different temp name here compared to the
7727 * one used by rewriteAppendOnlyFileBackground() function. */
7728 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7729 fp = fopen(tmpfile,"w");
7730 if (!fp) {
7731 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7732 return REDIS_ERR;
7733 }
7734 for (j = 0; j < server.dbnum; j++) {
7735 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7736 redisDb *db = server.db+j;
7737 dict *d = db->dict;
7738 if (dictSize(d) == 0) continue;
7739 di = dictGetIterator(d);
7740 if (!di) {
7741 fclose(fp);
7742 return REDIS_ERR;
7743 }
7744
7745 /* SELECT the new DB */
7746 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7747 if (fwriteBulkLong(fp,j) == 0) goto werr;
7748
7749 /* Iterate this DB writing every entry */
7750 while((de = dictNext(di)) != NULL) {
7751 robj *key, *o;
7752 time_t expiretime;
7753 int swapped;
7754
7755 key = dictGetEntryKey(de);
7756 /* If the value for this key is swapped, load a preview in memory.
7757 * We use a "swapped" flag to remember if we need to free the
7758 * value object instead to just increment the ref count anyway
7759 * in order to avoid copy-on-write of pages if we are forked() */
7760 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7761 key->storage == REDIS_VM_SWAPPING) {
7762 o = dictGetEntryVal(de);
7763 swapped = 0;
7764 } else {
7765 o = vmPreviewObject(key);
7766 swapped = 1;
7767 }
7768 expiretime = getExpire(db,key);
7769
7770 /* Save the key and associated value */
7771 if (o->type == REDIS_STRING) {
7772 /* Emit a SET command */
7773 char cmd[]="*3\r\n$3\r\nSET\r\n";
7774 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7775 /* Key and value */
7776 if (fwriteBulkObject(fp,key) == 0) goto werr;
7777 if (fwriteBulkObject(fp,o) == 0) goto werr;
7778 } else if (o->type == REDIS_LIST) {
7779 /* Emit the RPUSHes needed to rebuild the list */
7780 list *list = o->ptr;
7781 listNode *ln;
7782 listIter li;
7783
7784 listRewind(list,&li);
7785 while((ln = listNext(&li))) {
7786 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7787 robj *eleobj = listNodeValue(ln);
7788
7789 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7790 if (fwriteBulkObject(fp,key) == 0) goto werr;
7791 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7792 }
7793 } else if (o->type == REDIS_SET) {
7794 /* Emit the SADDs needed to rebuild the set */
7795 dict *set = o->ptr;
7796 dictIterator *di = dictGetIterator(set);
7797 dictEntry *de;
7798
7799 while((de = dictNext(di)) != NULL) {
7800 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7801 robj *eleobj = dictGetEntryKey(de);
7802
7803 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7804 if (fwriteBulkObject(fp,key) == 0) goto werr;
7805 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7806 }
7807 dictReleaseIterator(di);
7808 } else if (o->type == REDIS_ZSET) {
7809 /* Emit the ZADDs needed to rebuild the sorted set */
7810 zset *zs = o->ptr;
7811 dictIterator *di = dictGetIterator(zs->dict);
7812 dictEntry *de;
7813
7814 while((de = dictNext(di)) != NULL) {
7815 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7816 robj *eleobj = dictGetEntryKey(de);
7817 double *score = dictGetEntryVal(de);
7818
7819 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7820 if (fwriteBulkObject(fp,key) == 0) goto werr;
7821 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7822 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7823 }
7824 dictReleaseIterator(di);
7825 } else if (o->type == REDIS_HASH) {
7826 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7827
7828 /* Emit the HSETs needed to rebuild the hash */
7829 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7830 unsigned char *p = zipmapRewind(o->ptr);
7831 unsigned char *field, *val;
7832 unsigned int flen, vlen;
7833
7834 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7835 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7836 if (fwriteBulkObject(fp,key) == 0) goto werr;
7837 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7838 return -1;
7839 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7840 return -1;
7841 }
7842 } else {
7843 dictIterator *di = dictGetIterator(o->ptr);
7844 dictEntry *de;
7845
7846 while((de = dictNext(di)) != NULL) {
7847 robj *field = dictGetEntryKey(de);
7848 robj *val = dictGetEntryVal(de);
7849
7850 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7851 if (fwriteBulkObject(fp,key) == 0) goto werr;
7852 if (fwriteBulkObject(fp,field) == -1) return -1;
7853 if (fwriteBulkObject(fp,val) == -1) return -1;
7854 }
7855 dictReleaseIterator(di);
7856 }
7857 } else {
7858 redisAssert(0 != 0);
7859 }
7860 /* Save the expire time */
7861 if (expiretime != -1) {
7862 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7863 /* If this key is already expired skip it */
7864 if (expiretime < now) continue;
7865 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7866 if (fwriteBulkObject(fp,key) == 0) goto werr;
7867 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7868 }
7869 if (swapped) decrRefCount(o);
7870 }
7871 dictReleaseIterator(di);
7872 }
7873
7874 /* Make sure data will not remain on the OS's output buffers */
7875 fflush(fp);
7876 fsync(fileno(fp));
7877 fclose(fp);
7878
7879 /* Use RENAME to make sure the DB file is changed atomically only
7880 * if the generate DB file is ok. */
7881 if (rename(tmpfile,filename) == -1) {
7882 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
7883 unlink(tmpfile);
7884 return REDIS_ERR;
7885 }
7886 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
7887 return REDIS_OK;
7888
7889 werr:
7890 fclose(fp);
7891 unlink(tmpfile);
7892 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
7893 if (di) dictReleaseIterator(di);
7894 return REDIS_ERR;
7895 }
7896
7897 /* This is how rewriting of the append only file in background works:
7898 *
7899 * 1) The user calls BGREWRITEAOF
7900 * 2) Redis calls this function, that forks():
7901 * 2a) the child rewrite the append only file in a temp file.
7902 * 2b) the parent accumulates differences in server.bgrewritebuf.
7903 * 3) When the child finished '2a' exists.
7904 * 4) The parent will trap the exit code, if it's OK, will append the
7905 * data accumulated into server.bgrewritebuf into the temp file, and
7906 * finally will rename(2) the temp file in the actual file name.
7907 * The the new file is reopened as the new append only file. Profit!
7908 */
7909 static int rewriteAppendOnlyFileBackground(void) {
7910 pid_t childpid;
7911
7912 if (server.bgrewritechildpid != -1) return REDIS_ERR;
7913 if (server.vm_enabled) waitEmptyIOJobsQueue();
7914 if ((childpid = fork()) == 0) {
7915 /* Child */
7916 char tmpfile[256];
7917
7918 if (server.vm_enabled) vmReopenSwapFile();
7919 close(server.fd);
7920 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
7921 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
7922 _exit(0);
7923 } else {
7924 _exit(1);
7925 }
7926 } else {
7927 /* Parent */
7928 if (childpid == -1) {
7929 redisLog(REDIS_WARNING,
7930 "Can't rewrite append only file in background: fork: %s",
7931 strerror(errno));
7932 return REDIS_ERR;
7933 }
7934 redisLog(REDIS_NOTICE,
7935 "Background append only file rewriting started by pid %d",childpid);
7936 server.bgrewritechildpid = childpid;
7937 /* We set appendseldb to -1 in order to force the next call to the
7938 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7939 * accumulated by the parent into server.bgrewritebuf will start
7940 * with a SELECT statement and it will be safe to merge. */
7941 server.appendseldb = -1;
7942 return REDIS_OK;
7943 }
7944 return REDIS_OK; /* unreached */
7945 }
7946
7947 static void bgrewriteaofCommand(redisClient *c) {
7948 if (server.bgrewritechildpid != -1) {
7949 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7950 return;
7951 }
7952 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
7953 char *status = "+Background append only file rewriting started\r\n";
7954 addReplySds(c,sdsnew(status));
7955 } else {
7956 addReply(c,shared.err);
7957 }
7958 }
7959
/* Unlink the temp file named after 'childpid' that a background append
 * only file rewrite writes to ("temp-rewriteaof-bg-<pid>.aof"). The result
 * of unlink(2) is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int) childpid);
    unlink(path);
}
7966
7967 /* Virtual Memory is composed mainly of two subsystems:
7968 * - Blocking Virtual Memory
7969 * - Threaded Virtual Memory I/O
7970 * The two parts are not fully decoupled, but functions are split among two
7971 * different sections of the source code (delimited by comments) in order to
7972 * make more clear what functionality is about the blocking VM and what about
7973 * the threaded (not blocking) VM.
7974 *
7975 * Redis VM design:
7976 *
7977 * Redis VM is a blocking VM (one that blocks reading swapped values from
7978 * disk into memory when a value swapped out is needed in memory) that is made
7979 * unblocking by trying to examine the command argument vector in order to
7980 * load in background values that will likely be needed in order to exec
7981 * the command. The command is executed only once all the relevant keys
7982 * are loaded into memory.
7983 *
7984 * This is basically almost as simple as a blocking VM, but almost as parallel
7985 * as a fully non-blocking VM.
7986 */
7987
7988 /* =================== Virtual Memory - Blocking Side ====================== */
7989
7990 /* substitute the first occurrence of '%p' with the process pid in the
7991 * swap file name. */
7992 static void expandVmSwapFilename(void) {
7993 char *p = strstr(server.vm_swap_file,"%p");
7994 sds new;
7995
7996 if (!p) return;
7997 new = sdsempty();
7998 *p = '\0';
7999 new = sdscat(new,server.vm_swap_file);
8000 new = sdscatprintf(new,"%ld",(long) getpid());
8001 new = sdscat(new,p+2);
8002 zfree(server.vm_swap_file);
8003 server.vm_swap_file = new;
8004 }
8005
8006 static void vmInit(void) {
8007 off_t totsize;
8008 int pipefds[2];
8009 size_t stacksize;
8010
8011 if (server.vm_max_threads != 0)
8012 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8013
8014 expandVmSwapFilename();
8015 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8016 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8017 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8018 }
8019 if (server.vm_fp == NULL) {
8020 redisLog(REDIS_WARNING,
8021 "Impossible to open the swap file: %s. Exiting.",
8022 strerror(errno));
8023 exit(1);
8024 }
8025 server.vm_fd = fileno(server.vm_fp);
8026 server.vm_next_page = 0;
8027 server.vm_near_pages = 0;
8028 server.vm_stats_used_pages = 0;
8029 server.vm_stats_swapped_objects = 0;
8030 server.vm_stats_swapouts = 0;
8031 server.vm_stats_swapins = 0;
8032 totsize = server.vm_pages*server.vm_page_size;
8033 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8034 if (ftruncate(server.vm_fd,totsize) == -1) {
8035 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8036 strerror(errno));
8037 exit(1);
8038 } else {
8039 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8040 }
8041 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8042 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8043 (long long) (server.vm_pages+7)/8, server.vm_pages);
8044 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8045
8046 /* Initialize threaded I/O (used by Virtual Memory) */
8047 server.io_newjobs = listCreate();
8048 server.io_processing = listCreate();
8049 server.io_processed = listCreate();
8050 server.io_ready_clients = listCreate();
8051 pthread_mutex_init(&server.io_mutex,NULL);
8052 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8053 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8054 server.io_active_threads = 0;
8055 if (pipe(pipefds) == -1) {
8056 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8057 ,strerror(errno));
8058 exit(1);
8059 }
8060 server.io_ready_pipe_read = pipefds[0];
8061 server.io_ready_pipe_write = pipefds[1];
8062 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8063 /* LZF requires a lot of stack */
8064 pthread_attr_init(&server.io_threads_attr);
8065 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8066 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8067 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8068 /* Listen for events in the threaded I/O pipe */
8069 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8070 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8071 oom("creating file event");
8072 }
8073
8074 /* Mark the page as used */
8075 static void vmMarkPageUsed(off_t page) {
8076 off_t byte = page/8;
8077 int bit = page&7;
8078 redisAssert(vmFreePage(page) == 1);
8079 server.vm_bitmap[byte] |= 1<<bit;
8080 }
8081
8082 /* Mark N contiguous pages as used, with 'page' being the first. */
8083 static void vmMarkPagesUsed(off_t page, off_t count) {
8084 off_t j;
8085
8086 for (j = 0; j < count; j++)
8087 vmMarkPageUsed(page+j);
8088 server.vm_stats_used_pages += count;
8089 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8090 (long long)count, (long long)page);
8091 }
8092
8093 /* Mark the page as free */
8094 static void vmMarkPageFree(off_t page) {
8095 off_t byte = page/8;
8096 int bit = page&7;
8097 redisAssert(vmFreePage(page) == 0);
8098 server.vm_bitmap[byte] &= ~(1<<bit);
8099 }
8100
8101 /* Mark N contiguous pages as free, with 'page' being the first. */
8102 static void vmMarkPagesFree(off_t page, off_t count) {
8103 off_t j;
8104
8105 for (j = 0; j < count; j++)
8106 vmMarkPageFree(page+j);
8107 server.vm_stats_used_pages -= count;
8108 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8109 (long long)count, (long long)page);
8110 }
8111
8112 /* Test if the page is free */
8113 static int vmFreePage(off_t page) {
8114 off_t byte = page/8;
8115 int bit = page&7;
8116 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8117 }
8118
8119 /* Find N contiguous free pages storing the first page of the cluster in *first.
8120 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8121 * REDIS_ERR is returned.
8122 *
8123 * This function uses a simple algorithm: we try to allocate
8124 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8125 * again from the start of the swap file searching for free spaces.
8126 *
8127 * If it looks pretty clear that there are no free pages near our offset
8128 * we try to find less populated places doing a forward jump of
8129 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8130 * without hurry, and then we jump again and so forth...
8131 *
8132 * This function can be improved using a free list to avoid to guess
8133 * too much, since we could collect data about freed pages.
8134 *
8135 * note: I implemented this function just after watching an episode of
8136 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8137 */
8138 static int vmFindContiguousPages(off_t *first, off_t n) {
8139 off_t base, offset = 0, since_jump = 0, numfree = 0;
8140
8141 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8142 server.vm_near_pages = 0;
8143 server.vm_next_page = 0;
8144 }
8145 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8146 base = server.vm_next_page;
8147
8148 while(offset < server.vm_pages) {
8149 off_t this = base+offset;
8150
8151 /* If we overflow, restart from page zero */
8152 if (this >= server.vm_pages) {
8153 this -= server.vm_pages;
8154 if (this == 0) {
8155 /* Just overflowed, what we found on tail is no longer
8156 * interesting, as it's no longer contiguous. */
8157 numfree = 0;
8158 }
8159 }
8160 if (vmFreePage(this)) {
8161 /* This is a free page */
8162 numfree++;
8163 /* Already got N free pages? Return to the caller, with success */
8164 if (numfree == n) {
8165 *first = this-(n-1);
8166 server.vm_next_page = this+1;
8167 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8168 return REDIS_OK;
8169 }
8170 } else {
8171 /* The current one is not a free page */
8172 numfree = 0;
8173 }
8174
8175 /* Fast-forward if the current page is not free and we already
8176 * searched enough near this place. */
8177 since_jump++;
8178 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8179 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8180 since_jump = 0;
8181 /* Note that even if we rewind after the jump, we are don't need
8182 * to make sure numfree is set to zero as we only jump *if* it
8183 * is set to zero. */
8184 } else {
8185 /* Otherwise just check the next page */
8186 offset++;
8187 }
8188 }
8189 return REDIS_ERR;
8190 }
8191
8192 /* Write the specified object at the specified page of the swap file */
8193 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8194 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8195 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8196 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8197 redisLog(REDIS_WARNING,
8198 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8199 strerror(errno));
8200 return REDIS_ERR;
8201 }
8202 rdbSaveObject(server.vm_fp,o);
8203 fflush(server.vm_fp);
8204 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8205 return REDIS_OK;
8206 }
8207
8208 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8209 * needed to later retrieve the object into the key object.
8210 * If we can't find enough contiguous empty pages to swap the object on disk
8211 * REDIS_ERR is returned. */
8212 static int vmSwapObjectBlocking(robj *key, robj *val) {
8213 off_t pages = rdbSavedObjectPages(val,NULL);
8214 off_t page;
8215
8216 assert(key->storage == REDIS_VM_MEMORY);
8217 assert(key->refcount == 1);
8218 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8219 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8220 key->vm.page = page;
8221 key->vm.usedpages = pages;
8222 key->storage = REDIS_VM_SWAPPED;
8223 key->vtype = val->type;
8224 decrRefCount(val); /* Deallocate the object from memory. */
8225 vmMarkPagesUsed(page,pages);
8226 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8227 (unsigned char*) key->ptr,
8228 (unsigned long long) page, (unsigned long long) pages);
8229 server.vm_stats_swapped_objects++;
8230 server.vm_stats_swapouts++;
8231 return REDIS_OK;
8232 }
8233
8234 static robj *vmReadObjectFromSwap(off_t page, int type) {
8235 robj *o;
8236
8237 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8238 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8239 redisLog(REDIS_WARNING,
8240 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8241 strerror(errno));
8242 _exit(1);
8243 }
8244 o = rdbLoadObject(type,server.vm_fp);
8245 if (o == NULL) {
8246 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8247 _exit(1);
8248 }
8249 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8250 return o;
8251 }
8252
8253 /* Load the value object relative to the 'key' object from swap to memory.
8254 * The newly allocated object is returned.
8255 *
8256 * If preview is true the unserialized object is returned to the caller but
8257 * no changes are made to the key object, nor the pages are marked as freed */
8258 static robj *vmGenericLoadObject(robj *key, int preview) {
8259 robj *val;
8260
8261 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8262 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8263 if (!preview) {
8264 key->storage = REDIS_VM_MEMORY;
8265 key->vm.atime = server.unixtime;
8266 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8267 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8268 (unsigned char*) key->ptr);
8269 server.vm_stats_swapped_objects--;
8270 } else {
8271 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8272 (unsigned char*) key->ptr);
8273 }
8274 server.vm_stats_swapins++;
8275 return val;
8276 }
8277
8278 /* Plain object loading, from swap to memory */
8279 static robj *vmLoadObject(robj *key) {
8280 /* If we are loading the object in background, stop it, we
8281 * need to load this object synchronously ASAP. */
8282 if (key->storage == REDIS_VM_LOADING)
8283 vmCancelThreadedIOJob(key);
8284 return vmGenericLoadObject(key,0);
8285 }
8286
8287 /* Just load the value on disk, without to modify the key.
8288 * This is useful when we want to perform some operation on the value
8289 * without to really bring it from swap to memory, like while saving the
8290 * dataset or rewriting the append only log. */
8291 static robj *vmPreviewObject(robj *key) {
8292 return vmGenericLoadObject(key,1);
8293 }
8294
8295 /* How a good candidate is this object for swapping?
8296 * The better candidate it is, the greater the returned value.
8297 *
8298 * Currently we try to perform a fast estimation of the object size in
8299 * memory, and combine it with aging informations.
8300 *
8301 * Basically swappability = idle-time * log(estimated size)
8302 *
8303 * Bigger objects are preferred over smaller objects, but not
8304 * proportionally, this is why we use the logarithm. This algorithm is
8305 * just a first try and will probably be tuned later. */
8306 static double computeObjectSwappability(robj *o) {
8307 time_t age = server.unixtime - o->vm.atime;
8308 long asize = 0;
8309 list *l;
8310 dict *d;
8311 struct dictEntry *de;
8312 int z;
8313
8314 if (age <= 0) return 0;
8315 switch(o->type) {
8316 case REDIS_STRING:
8317 if (o->encoding != REDIS_ENCODING_RAW) {
8318 asize = sizeof(*o);
8319 } else {
8320 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8321 }
8322 break;
8323 case REDIS_LIST:
8324 l = o->ptr;
8325 listNode *ln = listFirst(l);
8326
8327 asize = sizeof(list);
8328 if (ln) {
8329 robj *ele = ln->value;
8330 long elesize;
8331
8332 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8333 (sizeof(*o)+sdslen(ele->ptr)) :
8334 sizeof(*o);
8335 asize += (sizeof(listNode)+elesize)*listLength(l);
8336 }
8337 break;
8338 case REDIS_SET:
8339 case REDIS_ZSET:
8340 z = (o->type == REDIS_ZSET);
8341 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8342
8343 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8344 if (z) asize += sizeof(zset)-sizeof(dict);
8345 if (dictSize(d)) {
8346 long elesize;
8347 robj *ele;
8348
8349 de = dictGetRandomKey(d);
8350 ele = dictGetEntryKey(de);
8351 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8352 (sizeof(*o)+sdslen(ele->ptr)) :
8353 sizeof(*o);
8354 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8355 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8356 }
8357 break;
8358 }
8359 return (double)age*log(1+asize);
8360 }
8361
8362 /* Try to swap an object that's a good candidate for swapping.
8363 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8364 * to swap any object at all.
8365 *
8366 * If 'usethreaded' is true, Redis will try to swap the object in background
8367 * using I/O threads. */
8368 static int vmSwapOneObject(int usethreads) {
8369 int j, i;
8370 struct dictEntry *best = NULL;
8371 double best_swappability = 0;
8372 redisDb *best_db = NULL;
8373 robj *key, *val;
8374
8375 for (j = 0; j < server.dbnum; j++) {
8376 redisDb *db = server.db+j;
8377 /* Why maxtries is set to 100?
8378 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8379 * are swappable objects */
8380 int maxtries = 100;
8381
8382 if (dictSize(db->dict) == 0) continue;
8383 for (i = 0; i < 5; i++) {
8384 dictEntry *de;
8385 double swappability;
8386
8387 if (maxtries) maxtries--;
8388 de = dictGetRandomKey(db->dict);
8389 key = dictGetEntryKey(de);
8390 val = dictGetEntryVal(de);
8391 /* Only swap objects that are currently in memory.
8392 *
8393 * Also don't swap shared objects if threaded VM is on, as we
8394 * try to ensure that the main thread does not touch the
8395 * object while the I/O thread is using it, but we can't
8396 * control other keys without adding additional mutex. */
8397 if (key->storage != REDIS_VM_MEMORY ||
8398 (server.vm_max_threads != 0 && val->refcount != 1)) {
8399 if (maxtries) i--; /* don't count this try */
8400 continue;
8401 }
8402 swappability = computeObjectSwappability(val);
8403 if (!best || swappability > best_swappability) {
8404 best = de;
8405 best_swappability = swappability;
8406 best_db = db;
8407 }
8408 }
8409 }
8410 if (best == NULL) return REDIS_ERR;
8411 key = dictGetEntryKey(best);
8412 val = dictGetEntryVal(best);
8413
8414 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8415 key->ptr, best_swappability);
8416
8417 /* Unshare the key if needed */
8418 if (key->refcount > 1) {
8419 robj *newkey = dupStringObject(key);
8420 decrRefCount(key);
8421 key = dictGetEntryKey(best) = newkey;
8422 }
8423 /* Swap it */
8424 if (usethreads) {
8425 vmSwapObjectThreaded(key,val,best_db);
8426 return REDIS_OK;
8427 } else {
8428 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8429 dictGetEntryVal(best) = NULL;
8430 return REDIS_OK;
8431 } else {
8432 return REDIS_ERR;
8433 }
8434 }
8435 }
8436
/* Swap one object out synchronously. Returns what vmSwapOneObject(0)
 * returns: REDIS_OK if an object was swapped, REDIS_ERR otherwise. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8440
/* Swap one object out using the I/O threads. Returns what
 * vmSwapOneObject(1) returns: REDIS_OK if a swap was started, REDIS_ERR if
 * no swappable object was found. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8444
8445 /* Return true if it's safe to swap out objects in a given moment.
8446 * Basically we don't want to swap objects out while there is a BGSAVE
8447 * or a BGAEOREWRITE running in backgroud. */
8448 static int vmCanSwapOut(void) {
8449 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8450 }
8451
8452 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8453 * and was deleted. Otherwise 0 is returned. */
8454 static int deleteIfSwapped(redisDb *db, robj *key) {
8455 dictEntry *de;
8456 robj *foundkey;
8457
8458 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8459 foundkey = dictGetEntryKey(de);
8460 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8461 deleteKey(db,key);
8462 return 1;
8463 }
8464
8465 /* =================== Virtual Memory - Threaded I/O ======================= */
8466
8467 static void freeIOJob(iojob *j) {
8468 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8469 j->type == REDIS_IOJOB_DO_SWAP ||
8470 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8471 decrRefCount(j->val);
8472 decrRefCount(j->key);
8473 zfree(j);
8474 }
8475
8476 /* Every time a thread finished a Job, it writes a byte into the write side
8477 * of an unix pipe in order to "awake" the main thread, and this function
8478 * is called. */
8479 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8480 int mask)
8481 {
8482 char buf[1];
8483 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8484 REDIS_NOTUSED(el);
8485 REDIS_NOTUSED(mask);
8486 REDIS_NOTUSED(privdata);
8487
8488 /* For every byte we read in the read side of the pipe, there is one
8489 * I/O job completed to process. */
8490 while((retval = read(fd,buf,1)) == 1) {
8491 iojob *j;
8492 listNode *ln;
8493 robj *key;
8494 struct dictEntry *de;
8495
8496 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8497
8498 /* Get the processed element (the oldest one) */
8499 lockThreadedIO();
8500 assert(listLength(server.io_processed) != 0);
8501 if (toprocess == -1) {
8502 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8503 if (toprocess <= 0) toprocess = 1;
8504 }
8505 ln = listFirst(server.io_processed);
8506 j = ln->value;
8507 listDelNode(server.io_processed,ln);
8508 unlockThreadedIO();
8509 /* If this job is marked as canceled, just ignore it */
8510 if (j->canceled) {
8511 freeIOJob(j);
8512 continue;
8513 }
8514 /* Post process it in the main thread, as there are things we
8515 * can do just here to avoid race conditions and/or invasive locks */
8516 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8517 de = dictFind(j->db->dict,j->key);
8518 assert(de != NULL);
8519 key = dictGetEntryKey(de);
8520 if (j->type == REDIS_IOJOB_LOAD) {
8521 redisDb *db;
8522
8523 /* Key loaded, bring it at home */
8524 key->storage = REDIS_VM_MEMORY;
8525 key->vm.atime = server.unixtime;
8526 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8527 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8528 (unsigned char*) key->ptr);
8529 server.vm_stats_swapped_objects--;
8530 server.vm_stats_swapins++;
8531 dictGetEntryVal(de) = j->val;
8532 incrRefCount(j->val);
8533 db = j->db;
8534 freeIOJob(j);
8535 /* Handle clients waiting for this key to be loaded. */
8536 handleClientsBlockedOnSwappedKey(db,key);
8537 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8538 /* Now we know the amount of pages required to swap this object.
8539 * Let's find some space for it, and queue this task again
8540 * rebranded as REDIS_IOJOB_DO_SWAP. */
8541 if (!vmCanSwapOut() ||
8542 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8543 {
8544 /* Ooops... no space or we can't swap as there is
8545 * a fork()ed Redis trying to save stuff on disk. */
8546 freeIOJob(j);
8547 key->storage = REDIS_VM_MEMORY; /* undo operation */
8548 } else {
8549 /* Note that we need to mark this pages as used now,
8550 * if the job will be canceled, we'll mark them as freed
8551 * again. */
8552 vmMarkPagesUsed(j->page,j->pages);
8553 j->type = REDIS_IOJOB_DO_SWAP;
8554 lockThreadedIO();
8555 queueIOJob(j);
8556 unlockThreadedIO();
8557 }
8558 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8559 robj *val;
8560
8561 /* Key swapped. We can finally free some memory. */
8562 if (key->storage != REDIS_VM_SWAPPING) {
8563 printf("key->storage: %d\n",key->storage);
8564 printf("key->name: %s\n",(char*)key->ptr);
8565 printf("key->refcount: %d\n",key->refcount);
8566 printf("val: %p\n",(void*)j->val);
8567 printf("val->type: %d\n",j->val->type);
8568 printf("val->ptr: %s\n",(char*)j->val->ptr);
8569 }
8570 redisAssert(key->storage == REDIS_VM_SWAPPING);
8571 val = dictGetEntryVal(de);
8572 key->vm.page = j->page;
8573 key->vm.usedpages = j->pages;
8574 key->storage = REDIS_VM_SWAPPED;
8575 key->vtype = j->val->type;
8576 decrRefCount(val); /* Deallocate the object from memory. */
8577 dictGetEntryVal(de) = NULL;
8578 redisLog(REDIS_DEBUG,
8579 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8580 (unsigned char*) key->ptr,
8581 (unsigned long long) j->page, (unsigned long long) j->pages);
8582 server.vm_stats_swapped_objects++;
8583 server.vm_stats_swapouts++;
8584 freeIOJob(j);
8585 /* Put a few more swap requests in queue if we are still
8586 * out of memory */
8587 if (trytoswap && vmCanSwapOut() &&
8588 zmalloc_used_memory() > server.vm_max_memory)
8589 {
8590 int more = 1;
8591 while(more) {
8592 lockThreadedIO();
8593 more = listLength(server.io_newjobs) <
8594 (unsigned) server.vm_max_threads;
8595 unlockThreadedIO();
8596 /* Don't waste CPU time if swappable objects are rare. */
8597 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8598 trytoswap = 0;
8599 break;
8600 }
8601 }
8602 }
8603 }
8604 processed++;
8605 if (processed == toprocess) return;
8606 }
8607 if (retval < 0 && errno != EAGAIN) {
8608 redisLog(REDIS_WARNING,
8609 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8610 strerror(errno));
8611 }
8612 }
8613
8614 static void lockThreadedIO(void) {
8615 pthread_mutex_lock(&server.io_mutex);
8616 }
8617
8618 static void unlockThreadedIO(void) {
8619 pthread_mutex_unlock(&server.io_mutex);
8620 }
8621
8622 /* Remove the specified object from the threaded I/O queue if still not
8623 * processed, otherwise make sure to flag it as canceled. */
8624 static void vmCancelThreadedIOJob(robj *o) {
8625 list *lists[3] = {
8626 server.io_newjobs, /* 0 */
8627 server.io_processing, /* 1 */
8628 server.io_processed /* 2 */
8629 };
8630 int i;
8631
8632 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8633 again:
8634 lockThreadedIO();
8635 /* Search for a matching key in one of the queues */
8636 for (i = 0; i < 3; i++) {
8637 listNode *ln;
8638 listIter li;
8639
8640 listRewind(lists[i],&li);
8641 while ((ln = listNext(&li)) != NULL) {
8642 iojob *job = ln->value;
8643
8644 if (job->canceled) continue; /* Skip this, already canceled. */
8645 if (compareStringObjects(job->key,o) == 0) {
8646 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8647 (void*)job, (char*)o->ptr, job->type, i);
8648 /* Mark the pages as free since the swap didn't happened
8649 * or happened but is now discarded. */
8650 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8651 vmMarkPagesFree(job->page,job->pages);
8652 /* Cancel the job. It depends on the list the job is
8653 * living in. */
8654 switch(i) {
8655 case 0: /* io_newjobs */
8656 /* If the job was yet not processed the best thing to do
8657 * is to remove it from the queue at all */
8658 freeIOJob(job);
8659 listDelNode(lists[i],ln);
8660 break;
8661 case 1: /* io_processing */
8662 /* Oh Shi- the thread is messing with the Job:
8663 *
8664 * Probably it's accessing the object if this is a
8665 * PREPARE_SWAP or DO_SWAP job.
8666 * If it's a LOAD job it may be reading from disk and
8667 * if we don't wait for the job to terminate before to
8668 * cancel it, maybe in a few microseconds data can be
8669 * corrupted in this pages. So the short story is:
8670 *
8671 * Better to wait for the job to move into the
8672 * next queue (processed)... */
8673
8674 /* We try again and again until the job is completed. */
8675 unlockThreadedIO();
8676 /* But let's wait some time for the I/O thread
8677 * to finish with this job. After all this condition
8678 * should be very rare. */
8679 usleep(1);
8680 goto again;
8681 case 2: /* io_processed */
8682 /* The job was already processed, that's easy...
8683 * just mark it as canceled so that we'll ignore it
8684 * when processing completed jobs. */
8685 job->canceled = 1;
8686 break;
8687 }
8688 /* Finally we have to adjust the storage type of the object
8689 * in order to "UNDO" the operaiton. */
8690 if (o->storage == REDIS_VM_LOADING)
8691 o->storage = REDIS_VM_SWAPPED;
8692 else if (o->storage == REDIS_VM_SWAPPING)
8693 o->storage = REDIS_VM_MEMORY;
8694 unlockThreadedIO();
8695 return;
8696 }
8697 }
8698 }
8699 unlockThreadedIO();
8700 assert(1 != 1); /* We should never reach this */
8701 }
8702
8703 static void *IOThreadEntryPoint(void *arg) {
8704 iojob *j;
8705 listNode *ln;
8706 REDIS_NOTUSED(arg);
8707
8708 pthread_detach(pthread_self());
8709 while(1) {
8710 /* Get a new job to process */
8711 lockThreadedIO();
8712 if (listLength(server.io_newjobs) == 0) {
8713 /* No new jobs in queue, exit. */
8714 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8715 (long) pthread_self());
8716 server.io_active_threads--;
8717 unlockThreadedIO();
8718 return NULL;
8719 }
8720 ln = listFirst(server.io_newjobs);
8721 j = ln->value;
8722 listDelNode(server.io_newjobs,ln);
8723 /* Add the job in the processing queue */
8724 j->thread = pthread_self();
8725 listAddNodeTail(server.io_processing,j);
8726 ln = listLast(server.io_processing); /* We use ln later to remove it */
8727 unlockThreadedIO();
8728 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8729 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8730
8731 /* Process the Job */
8732 if (j->type == REDIS_IOJOB_LOAD) {
8733 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8734 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8735 FILE *fp = fopen("/dev/null","w+");
8736 j->pages = rdbSavedObjectPages(j->val,fp);
8737 fclose(fp);
8738 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8739 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8740 j->canceled = 1;
8741 }
8742
8743 /* Done: insert the job into the processed queue */
8744 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8745 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8746 lockThreadedIO();
8747 listDelNode(server.io_processing,ln);
8748 listAddNodeTail(server.io_processed,j);
8749 unlockThreadedIO();
8750
8751 /* Signal the main thread there is new stuff to process */
8752 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8753 }
8754 return NULL; /* never reached */
8755 }
8756
8757 static void spawnIOThread(void) {
8758 pthread_t thread;
8759 sigset_t mask, omask;
8760
8761 sigemptyset(&mask);
8762 sigaddset(&mask,SIGCHLD);
8763 sigaddset(&mask,SIGHUP);
8764 sigaddset(&mask,SIGPIPE);
8765 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8766 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
8767 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8768 server.io_active_threads++;
8769 }
8770
8771 /* We need to wait for the last thread to exit before we are able to
8772 * fork() in order to BGSAVE or BGREWRITEAOF. */
8773 static void waitEmptyIOJobsQueue(void) {
8774 while(1) {
8775 int io_processed_len;
8776
8777 lockThreadedIO();
8778 if (listLength(server.io_newjobs) == 0 &&
8779 listLength(server.io_processing) == 0 &&
8780 server.io_active_threads == 0)
8781 {
8782 unlockThreadedIO();
8783 return;
8784 }
8785 /* While waiting for empty jobs queue condition we post-process some
8786 * finshed job, as I/O threads may be hanging trying to write against
8787 * the io_ready_pipe_write FD but there are so much pending jobs that
8788 * it's blocking. */
8789 io_processed_len = listLength(server.io_processed);
8790 unlockThreadedIO();
8791 if (io_processed_len) {
8792 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8793 usleep(1000); /* 1 millisecond */
8794 } else {
8795 usleep(10000); /* 10 milliseconds */
8796 }
8797 }
8798 }
8799
8800 static void vmReopenSwapFile(void) {
8801 /* Note: we don't close the old one as we are in the child process
8802 * and don't want to mess at all with the original file object. */
8803 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8804 if (server.vm_fp == NULL) {
8805 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8806 server.vm_swap_file);
8807 _exit(1);
8808 }
8809 server.vm_fd = fileno(server.vm_fp);
8810 }
8811
8812 /* This function must be called while with threaded IO locked */
8813 static void queueIOJob(iojob *j) {
8814 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8815 (void*)j, j->type, (char*)j->key->ptr);
8816 listAddNodeTail(server.io_newjobs,j);
8817 if (server.io_active_threads < server.vm_max_threads)
8818 spawnIOThread();
8819 }
8820
8821 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8822 iojob *j;
8823
8824 assert(key->storage == REDIS_VM_MEMORY);
8825 assert(key->refcount == 1);
8826
8827 j = zmalloc(sizeof(*j));
8828 j->type = REDIS_IOJOB_PREPARE_SWAP;
8829 j->db = db;
8830 j->key = dupStringObject(key);
8831 j->val = val;
8832 incrRefCount(val);
8833 j->canceled = 0;
8834 j->thread = (pthread_t) -1;
8835 key->storage = REDIS_VM_SWAPPING;
8836
8837 lockThreadedIO();
8838 queueIOJob(j);
8839 unlockThreadedIO();
8840 return REDIS_OK;
8841 }
8842
8843 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
8844
8845 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
8846 * If there is not already a job loading the key, it is craeted.
8847 * The key is added to the io_keys list in the client structure, and also
8848 * in the hash table mapping swapped keys to waiting clients, that is,
8849 * server.io_waited_keys. */
8850 static int waitForSwappedKey(redisClient *c, robj *key) {
8851 struct dictEntry *de;
8852 robj *o;
8853 list *l;
8854
8855 /* If the key does not exist or is already in RAM we don't need to
8856 * block the client at all. */
8857 de = dictFind(c->db->dict,key);
8858 if (de == NULL) return 0;
8859 o = dictGetEntryKey(de);
8860 if (o->storage == REDIS_VM_MEMORY) {
8861 return 0;
8862 } else if (o->storage == REDIS_VM_SWAPPING) {
8863 /* We were swapping the key, undo it! */
8864 vmCancelThreadedIOJob(o);
8865 return 0;
8866 }
8867
8868 /* OK: the key is either swapped, or being loaded just now. */
8869
8870 /* Add the key to the list of keys this client is waiting for.
8871 * This maps clients to keys they are waiting for. */
8872 listAddNodeTail(c->io_keys,key);
8873 incrRefCount(key);
8874
8875 /* Add the client to the swapped keys => clients waiting map. */
8876 de = dictFind(c->db->io_keys,key);
8877 if (de == NULL) {
8878 int retval;
8879
8880 /* For every key we take a list of clients blocked for it */
8881 l = listCreate();
8882 retval = dictAdd(c->db->io_keys,key,l);
8883 incrRefCount(key);
8884 assert(retval == DICT_OK);
8885 } else {
8886 l = dictGetEntryVal(de);
8887 }
8888 listAddNodeTail(l,c);
8889
8890 /* Are we already loading the key from disk? If not create a job */
8891 if (o->storage == REDIS_VM_SWAPPED) {
8892 iojob *j;
8893
8894 o->storage = REDIS_VM_LOADING;
8895 j = zmalloc(sizeof(*j));
8896 j->type = REDIS_IOJOB_LOAD;
8897 j->db = c->db;
8898 j->key = dupStringObject(key);
8899 j->key->vtype = o->vtype;
8900 j->page = o->vm.page;
8901 j->val = NULL;
8902 j->canceled = 0;
8903 j->thread = (pthread_t) -1;
8904 lockThreadedIO();
8905 queueIOJob(j);
8906 unlockThreadedIO();
8907 }
8908 return 1;
8909 }
8910
8911 /* Is this client attempting to run a command against swapped keys?
8912 * If so, block it ASAP, load the keys in background, then resume it.
8913 *
8914 * The important idea about this function is that it can fail! If keys will
8915 * still be swapped when the client is resumed, this key lookups will
8916 * just block loading keys from disk. In practical terms this should only
8917 * happen with SORT BY command or if there is a bug in this function.
8918 *
8919 * Return 1 if the client is marked as blocked, 0 if the client can
8920 * continue as the keys it is going to access appear to be in memory. */
8921 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8922 int j, last;
8923
8924 if (cmd->vm_firstkey == 0) return 0;
8925 last = cmd->vm_lastkey;
8926 if (last < 0) last = c->argc+last;
8927 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
8928 waitForSwappedKey(c,c->argv[j]);
8929 /* If the client was blocked for at least one key, mark it as blocked. */
8930 if (listLength(c->io_keys)) {
8931 c->flags |= REDIS_IO_WAIT;
8932 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8933 server.vm_blocked_clients++;
8934 return 1;
8935 } else {
8936 return 0;
8937 }
8938 }
8939
8940 /* Remove the 'key' from the list of blocked keys for a given client.
8941 *
8942 * The function returns 1 when there are no longer blocking keys after
8943 * the current one was removed (and the client can be unblocked). */
8944 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8945 list *l;
8946 listNode *ln;
8947 listIter li;
8948 struct dictEntry *de;
8949
8950 /* Remove the key from the list of keys this client is waiting for. */
8951 listRewind(c->io_keys,&li);
8952 while ((ln = listNext(&li)) != NULL) {
8953 if (compareStringObjects(ln->value,key) == 0) {
8954 listDelNode(c->io_keys,ln);
8955 break;
8956 }
8957 }
8958 assert(ln != NULL);
8959
8960 /* Remove the client form the key => waiting clients map. */
8961 de = dictFind(c->db->io_keys,key);
8962 assert(de != NULL);
8963 l = dictGetEntryVal(de);
8964 ln = listSearchKey(l,c);
8965 assert(ln != NULL);
8966 listDelNode(l,ln);
8967 if (listLength(l) == 0)
8968 dictDelete(c->db->io_keys,key);
8969
8970 return listLength(c->io_keys) == 0;
8971 }
8972
8973 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8974 struct dictEntry *de;
8975 list *l;
8976 listNode *ln;
8977 int len;
8978
8979 de = dictFind(db->io_keys,key);
8980 if (!de) return;
8981
8982 l = dictGetEntryVal(de);
8983 len = listLength(l);
8984 /* Note: we can't use something like while(listLength(l)) as the list
8985 * can be freed by the calling function when we remove the last element. */
8986 while (len--) {
8987 ln = listFirst(l);
8988 redisClient *c = ln->value;
8989
8990 if (dontWaitForSwappedKey(c,key)) {
8991 /* Put the client in the list of clients ready to go as we
8992 * loaded all the keys about it. */
8993 listAddNodeTail(server.io_ready_clients,c);
8994 }
8995 }
8996 }
8997
8998 /* ================================= Debugging ============================== */
8999
9000 static void debugCommand(redisClient *c) {
9001 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9002 *((char*)-1) = 'x';
9003 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9004 if (rdbSave(server.dbfilename) != REDIS_OK) {
9005 addReply(c,shared.err);
9006 return;
9007 }
9008 emptyDb();
9009 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9010 addReply(c,shared.err);
9011 return;
9012 }
9013 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9014 addReply(c,shared.ok);
9015 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9016 emptyDb();
9017 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9018 addReply(c,shared.err);
9019 return;
9020 }
9021 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9022 addReply(c,shared.ok);
9023 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9024 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9025 robj *key, *val;
9026
9027 if (!de) {
9028 addReply(c,shared.nokeyerr);
9029 return;
9030 }
9031 key = dictGetEntryKey(de);
9032 val = dictGetEntryVal(de);
9033 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9034 key->storage == REDIS_VM_SWAPPING)) {
9035 char *strenc;
9036 char buf[128];
9037
9038 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9039 strenc = strencoding[val->encoding];
9040 } else {
9041 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9042 strenc = buf;
9043 }
9044 addReplySds(c,sdscatprintf(sdsempty(),
9045 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9046 "encoding:%s serializedlength:%lld\r\n",
9047 (void*)key, key->refcount, (void*)val, val->refcount,
9048 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9049 } else {
9050 addReplySds(c,sdscatprintf(sdsempty(),
9051 "+Key at:%p refcount:%d, value swapped at: page %llu "
9052 "using %llu pages\r\n",
9053 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9054 (unsigned long long) key->vm.usedpages));
9055 }
9056 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9057 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9058 robj *key, *val;
9059
9060 if (!server.vm_enabled) {
9061 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9062 return;
9063 }
9064 if (!de) {
9065 addReply(c,shared.nokeyerr);
9066 return;
9067 }
9068 key = dictGetEntryKey(de);
9069 val = dictGetEntryVal(de);
9070 /* If the key is shared we want to create a copy */
9071 if (key->refcount > 1) {
9072 robj *newkey = dupStringObject(key);
9073 decrRefCount(key);
9074 key = dictGetEntryKey(de) = newkey;
9075 }
9076 /* Swap it */
9077 if (key->storage != REDIS_VM_MEMORY) {
9078 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9079 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9080 dictGetEntryVal(de) = NULL;
9081 addReply(c,shared.ok);
9082 } else {
9083 addReply(c,shared.err);
9084 }
9085 } else {
9086 addReplySds(c,sdsnew(
9087 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9088 }
9089 }
9090
9091 static void _redisAssert(char *estr, char *file, int line) {
9092 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9093 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9094 #ifdef HAVE_BACKTRACE
9095 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9096 *((char*)-1) = 'x';
9097 #endif
9098 }
9099
9100 /* =================================== Main! ================================ */
9101
9102 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file can't be opened or no line
 * can be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,64,fp);
    fclose(fp);

    return (got != NULL) ? atoi(line) : -1;
}
9116
9117 void linuxOvercommitMemoryWarning(void) {
9118 if (linuxOvercommitMemoryValue() == 0) {
9119 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9120 }
9121 }
9122 #endif /* __linux__ */
9123
9124 static void daemonize(void) {
9125 int fd;
9126 FILE *fp;
9127
9128 if (fork() != 0) exit(0); /* parent exits */
9129 setsid(); /* create a new session */
9130
9131 /* Every output goes to /dev/null. If Redis is daemonized but
9132 * the 'logfile' is set to 'stdout' in the configuration file
9133 * it will not log at all. */
9134 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9135 dup2(fd, STDIN_FILENO);
9136 dup2(fd, STDOUT_FILENO);
9137 dup2(fd, STDERR_FILENO);
9138 if (fd > STDERR_FILENO) close(fd);
9139 }
9140 /* Try to write the pid file */
9141 fp = fopen(server.pidfile,"w");
9142 if (fp) {
9143 fprintf(fp,"%d\n",getpid());
9144 fclose(fp);
9145 }
9146 }
9147
9148 int main(int argc, char **argv) {
9149 time_t start;
9150
9151 initServerConfig();
9152 if (argc == 2) {
9153 resetServerSaveParams();
9154 loadServerConfig(argv[1]);
9155 } else if (argc > 2) {
9156 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
9157 exit(1);
9158 } else {
9159 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9160 }
9161 if (server.daemonize) daemonize();
9162 initServer();
9163 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9164 #ifdef __linux__
9165 linuxOvercommitMemoryWarning();
9166 #endif
9167 start = time(NULL);
9168 if (server.appendonly) {
9169 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9170 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9171 } else {
9172 if (rdbLoad(server.dbfilename) == REDIS_OK)
9173 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9174 }
9175 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9176 aeSetBeforeSleepProc(server.el,beforeSleep);
9177 aeMain(server.el);
9178 aeDeleteEventLoop(server.el);
9179 return 0;
9180 }
9181
9182 /* ============================= Backtrace support ========================= */
9183
9184 #ifdef HAVE_BACKTRACE
9185 static char *findFuncName(void *pointer, unsigned long *offset);
9186
9187 static void *getMcontextEip(ucontext_t *uc) {
9188 #if defined(__FreeBSD__)
9189 return (void*) uc->uc_mcontext.mc_eip;
9190 #elif defined(__dietlibc__)
9191 return (void*) uc->uc_mcontext.eip;
9192 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9193 #if __x86_64__
9194 return (void*) uc->uc_mcontext->__ss.__rip;
9195 #else
9196 return (void*) uc->uc_mcontext->__ss.__eip;
9197 #endif
9198 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9199 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9200 return (void*) uc->uc_mcontext->__ss.__rip;
9201 #else
9202 return (void*) uc->uc_mcontext->__ss.__eip;
9203 #endif
9204 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9205 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9206 #elif defined(__ia64__) /* Linux IA64 */
9207 return (void*) uc->uc_mcontext.sc_ip;
9208 #else
9209 return NULL;
9210 #endif
9211 }
9212
9213 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9214 void *trace[100];
9215 char **messages = NULL;
9216 int i, trace_size = 0;
9217 unsigned long offset=0;
9218 ucontext_t *uc = (ucontext_t*) secret;
9219 sds infostring;
9220 REDIS_NOTUSED(info);
9221
9222 redisLog(REDIS_WARNING,
9223 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9224 infostring = genRedisInfoString();
9225 redisLog(REDIS_WARNING, "%s",infostring);
9226 /* It's not safe to sdsfree() the returned string under memory
9227 * corruption conditions. Let it leak as we are going to abort */
9228
9229 trace_size = backtrace(trace, 100);
9230 /* overwrite sigaction with caller's address */
9231 if (getMcontextEip(uc) != NULL) {
9232 trace[1] = getMcontextEip(uc);
9233 }
9234 messages = backtrace_symbols(trace, trace_size);
9235
9236 for (i=1; i<trace_size; ++i) {
9237 char *fn = findFuncName(trace[i], &offset), *p;
9238
9239 p = strchr(messages[i],'+');
9240 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9241 redisLog(REDIS_WARNING,"%s", messages[i]);
9242 } else {
9243 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9244 }
9245 }
9246 /* free(messages); Don't call free() with possibly corrupted memory. */
9247 _exit(0);
9248 }
9249
9250 static void setupSigSegvAction(void) {
9251 struct sigaction act;
9252
9253 sigemptyset (&act.sa_mask);
9254 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9255 * is used. Otherwise, sa_handler is used */
9256 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9257 act.sa_sigaction = segvHandler;
9258 sigaction (SIGSEGV, &act, NULL);
9259 sigaction (SIGBUS, &act, NULL);
9260 sigaction (SIGFPE, &act, NULL);
9261 sigaction (SIGILL, &act, NULL);
9262 sigaction (SIGBUS, &act, NULL);
9263 return;
9264 }
9265
9266 #include "staticsymbols.h"
9267 /* This function try to convert a pointer into a function name. It's used in
9268 * oreder to provide a backtrace under segmentation fault that's able to
9269 * display functions declared as static (otherwise the backtrace is useless). */
9270 static char *findFuncName(void *pointer, unsigned long *offset){
9271 int i, ret = -1;
9272 unsigned long off, minoff = 0;
9273
9274 /* Try to match against the Symbol with the smallest offset */
9275 for (i=0; symsTable[i].pointer; i++) {
9276 unsigned long lp = (unsigned long) pointer;
9277
9278 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9279 off=lp-symsTable[i].pointer;
9280 if (ret < 0 || off < minoff) {
9281 minoff=off;
9282 ret=i;
9283 }
9284 }
9285 }
9286 if (ret == -1) return NULL;
9287 *offset = minoff;
9288 return symsTable[ret].name;
9289 }
9290 #else /* HAVE_BACKTRACE */
/* Backtrace support is not compiled in: there is no handler to install,
 * so this is a no-op. */
static void setupSigSegvAction(void)
{
    /* Intentionally empty. */
}
9293 #endif /* HAVE_BACKTRACE */
9294
9295
9296
9297 /* The End */
9298
9299
9300