]> git.saurik.com Git - redis.git/blob - redis.c
465398ada852a50c1d2fa1d3593bd634bed50439
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this source tree. */
#define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <signal.h>
41
42 #ifdef HAVE_BACKTRACE
43 #include <execinfo.h>
44 #include <ucontext.h>
45 #endif /* HAVE_BACKTRACE */
46
47 #include <sys/wait.h>
48 #include <errno.h>
49 #include <assert.h>
50 #include <ctype.h>
51 #include <stdarg.h>
52 #include <inttypes.h>
53 #include <arpa/inet.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #include <sys/time.h>
57 #include <sys/resource.h>
58 #include <sys/uio.h>
59 #include <limits.h>
60 #include <math.h>
61 #include <pthread.h>
62
63 #if defined(__sun)
64 #include "solarisfixes.h"
65 #endif
66
67 #include "redis.h"
68 #include "ae.h" /* Event driven programming library */
69 #include "sds.h" /* Dynamic safe strings */
70 #include "anet.h" /* Networking the easy way */
71 #include "dict.h" /* Hash tables */
72 #include "adlist.h" /* Linked lists */
73 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
74 #include "lzf.h" /* LZF compression library */
75 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
76 #include "zipmap.h"
77
/* Status / error codes */
#define REDIS_OK    0
#define REDIS_ERR   -1
81
/* ---- Static server configuration ---- */
#define REDIS_SERVERPORT            6379    /* TCP port */
#define REDIS_MAXIDLETIME           (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN             1024
#define REDIS_LOADBUF_LEN           1024
#define REDIS_STATIC_ARGS           8
#define REDIS_DEFAULT_DBNUM         16
#define REDIS_CONFIGLINE_MAX        1024
#define REDIS_OBJFREELIST_MAX       1000000 /* max number of objects to cache */
#define REDIS_MAX_SYNC_TIME         60      /* a slave can't take longer than this to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10     /* look up 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT   (1024*64)
#define REDIS_REQUEST_MAX_SIZE      (1024*1024*256) /* max bytes in inline command */

/* writev is used when more than REDIS_WRITEV_THRESHOLD write packets are
 * pending. */
#define REDIS_WRITEV_THRESHOLD      3
/* Upper bound on the number of iovecs handed to a single writev call. */
#define REDIS_WRITEV_IOVEC_COUNT    256

/* ---- Hash table parameters ---- */
#define REDIS_HT_MINFILL            10      /* minimal hash table fill: 10% */
103
/* Command flags (one bit each, meant to be OR-ed together). */
#define REDIS_CMD_BULK              (1<<0)  /* Bulk write command */
#define REDIS_CMD_INLINE            (1<<1)  /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied under low memory conditions. */
#define REDIS_CMD_DENYOOM           (1<<2)
#define REDIS_CMD_FORCE_REPLICATION (1<<3)  /* Force replication even if dirty is 0 */
113
/* Object types */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4
120
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in more than one way. The 'encoding' field of the
 * object is set to one of the following values. */
#define REDIS_ENCODING_RAW      0   /* Raw representation */
#define REDIS_ENCODING_INT      1   /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP   2   /* Encoded as zipmap */
#define REDIS_ENCODING_HT       3   /* Encoded as a hash table */

/* Printable name of every encoding, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
132
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME    253
#define REDIS_SELECTDB      254
#define REDIS_EOF           255

/* Defines related to the dump file format. Storing a 32 bit length for
 * short keys would waste a lot of space, so the two most significant bits
 * of the first byte tell how the length has to be interpreted:
 *
 * 00|000000                  => the len is the low 6 bits of this byte
 * 01|000000 00000000         => the len is 14 bits: the 6 bits of this byte
 *                               plus the 8 bits of the next byte
 * 10|000000 [32 bit integer] => a full 32 bit len follows
 * 11|000000                  => a specially encoded object follows; the six
 *                               bits specify the kind of object.
 *                               See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored in a single byte: most DB keys, and many
 * values, will fit. */
#define REDIS_RDB_6BITLEN   0
#define REDIS_RDB_14BITLEN  1
#define REDIS_RDB_32BITLEN  2
#define REDIS_RDB_ENCVAL    3
#define REDIS_RDB_LENERR    UINT_MAX
156
/* When the length of a string object stored on disk has its first two bits
 * set, the remaining bits select a special encoding for the object,
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8  0   /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1   /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2   /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF   3   /* string compressed with FASTLZ */
164
/* Values for the Virtual Memory object 'storage' field. */
#define REDIS_VM_MEMORY     0   /* The object is in memory */
#define REDIS_VM_SWAPPED    1   /* The object is on disk */
#define REDIS_VM_SWAPPING   2   /* Redis is swapping this object to disk */
#define REDIS_VM_LOADING    3   /* Redis is loading this object from disk */

/* Virtual memory static configuration.
 * See vmFindContiguousPages() to learn more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES     65536
#define REDIS_VM_MAX_RANDOM_JUMP    4096
#define REDIS_VM_MAX_THREADS        32
#define REDIS_THREAD_STACK_SIZE     (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. Virtual Memory I/O operations are performed by
 * threads, but once completed they must be processed by the main thread
 * in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
182
/* Client flags (one bit each, stored OR-ed together). */
#define REDIS_SLAVE     (1<<0)  /* This client is a slave server */
#define REDIS_MASTER    (1<<1)  /* This client is a master server */
#define REDIS_MONITOR   (1<<2)  /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI     (1<<3)  /* This client is in a MULTI context */
#define REDIS_BLOCKED   (1<<4)  /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT   (1<<5)  /* The client is waiting for Virtual Memory I/O */
190
/* Slave replication state - slave side */
#define REDIS_REPL_NONE         0   /* No active replication */
#define REDIS_REPL_CONNECT      1   /* Must connect to master */
#define REDIS_REPL_CONNECTED    2   /* Connected to master */

/* Slave replication state - as seen by the master.
 * In the SEND_BULK and ONLINE states the slave receives new updates in its
 * output queue. In the WAIT_BGSAVE states the server is instead waiting to
 * start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END   4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK         5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE            6 /* bulk DB already transmitted, receive updates */
204
/* List ends */
#define REDIS_HEAD          0
#define REDIS_TAIL          1

/* Sort operations */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024
214
/* Log levels, in increasing numeric order */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
220
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused variable or parameter does not trigger a compiler
 * warning. The argument is parenthesized so that any expression can be
 * passed, e.g. REDIS_NOTUSED(a + b). */
#define REDIS_NOTUSED(V) ((void)(V))
223
/* Skiplist tuning parameters */
#define ZSKIPLIST_MAXLEVEL  32      /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P         0.25    /* Skiplist P = 1/4 */
226
/* Append only file fsync policies */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Defaults for the hash type */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
235
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 * stringified expression, the file name and the line number, then
 * terminates the process with _exit(1).
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, the file name and the line number, then terminates the process
 * with _exit(1). Note that the argument is stringified with #, so a string
 * literal is passed on including its quotes.
 *
 * Both expansions are fully parenthesized so that each macro behaves as a
 * single expression wherever it is used (for instance as the last operand
 * of a ?: operator, where an unparenthesized comma expression would make
 * the _exit(1) call unconditional). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
241
242 /*================================= Data types ============================== */
243
/* A Redis object: a type able to hold a string, list, set, sorted set or hash */
245
/* The VM object structure: Virtual Memory bookkeeping embedded in every
 * Redis object (see the 'vm' member of struct redisObject).
 *
 * Only the type is declared here. The previous trailing declarator also
 * defined a file-scope variable named 'vm' with external linkage; that
 * variable is no longer defined. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
252
253 /* The actual Redis Object */
254 typedef struct redisObject {
255 void *ptr;
256 unsigned char type;
257 unsigned char encoding;
258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
262 int refcount;
263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
268 } robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the robj lvalue to fill in, _ptr the value stored in its 'ptr'
 * field. The object is set up as a raw-encoded string with a reference
 * count of 1; 'storage' is only written when server.vm_enabled is true.
 *
 * This macro is kept near the structure definition to make sure it gets
 * updated when the structure is changed, to avoid bugs like bug #85 that
 * was introduced exactly in this way.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon, so
 * "initStaticStringObject(o,p);" is exactly one statement and can be used
 * as the body of an if/else. Arguments are parenthesized. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
282 typedef struct redisDb {
283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
286 dict *io_keys; /* Keys with clients waiting for VM I/O */
287 int id;
288 } redisDb;
289
290 /* Client MULTI/EXEC state */
291 typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295 } multiCmd;
296
297 typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300 } multiState;
301
302 /* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304 typedef struct redisClient {
305 int fd;
306 redisDb *db;
307 int dictid;
308 sds querybuf;
309 robj **argv, **mbargv;
310 int argc, mbargc;
311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
312 int multibulk; /* multi bulk command format active */
313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
321 long repldboff; /* replication DB file offset */
322 off_t repldbsize; /* replication DB file size */
323 multiState mstate; /* MULTI/EXEC state */
324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
325 * operation such as BLPOP. Otherwise NULL. */
326 int blockingkeysnum; /* Number of blocking keys */
327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
333 } redisClient;
334
/* A (seconds, changes) pair; the server keeps an array of these in its
 * 'saveparams' field. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340 /* Global server state structure */
341 struct redisServer {
342 int port;
343 int fd;
344 redisDb *db;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int rdbcompression;
381 int activerehashing;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
442 /* Misc */
443 FILE *devnull;
444 };
445
446 typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449 } pubsubPattern;
450
451 typedef void redisCommandProc(redisClient *c);
452 struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
465 };
466
/* A name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
472 typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478 } redisSortObject;
479
480 typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483 } redisSortOperation;
484
485 /* ZSETs use a specialized version of Skiplists */
486
487 typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
489 struct zskiplistNode *backward;
490 unsigned int *span;
491 double score;
492 robj *obj;
493 } zskiplistNode;
494
/* The skiplist itself: end nodes, element count and current level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
500
501 typedef struct zset {
502 dict *dict;
503 zskiplist *zsl;
504 } zset;
505
506 /* Our shared "common" objects */
507
508 #define REDIS_SHARED_INTEGERS 10000
509 struct sharedObjectsStruct {
510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
511 *colon, *nullbulk, *nullmultibulk, *queued,
512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
514 *select0, *select1, *select2, *select3, *select4,
515 *select5, *select6, *select7, *select8, *select9,
516 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *mbulk4, *psubscribebulk, *punsubscribebulk,
518 *integers[REDIS_SHARED_INTEGERS];
519 } shared;
520
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
526
527 /* VM threaded I/O request message */
528 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
529 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
530 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
531 typedef struct iojob {
532 int type; /* Request type, REDIS_IOJOB_* */
533 redisDb *db;/* Redis database */
534 robj *key; /* This I/O request is about swapping this key */
535 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
536 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
537 off_t page; /* Swap page where to read/write the object */
538 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
539 int canceled; /* True if this command was canceled by blocking side of VM */
540 pthread_t thread; /* ID of the thread processing this entry */
541 } iojob;
542
543 /*================================ Prototypes =============================== */
544
545 static void freeStringObject(robj *o);
546 static void freeListObject(robj *o);
547 static void freeSetObject(robj *o);
548 static void decrRefCount(void *o);
549 static robj *createObject(int type, void *ptr);
550 static void freeClient(redisClient *c);
551 static int rdbLoad(char *filename);
552 static void addReply(redisClient *c, robj *obj);
553 static void addReplySds(redisClient *c, sds s);
554 static void incrRefCount(robj *o);
555 static int rdbSaveBackground(char *filename);
556 static robj *createStringObject(char *ptr, size_t len);
557 static robj *dupStringObject(robj *o);
558 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
559 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
560 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
561 static int syncWithMaster(void);
562 static robj *tryObjectEncoding(robj *o);
563 static robj *getDecodedObject(robj *o);
564 static int removeExpire(redisDb *db, robj *key);
565 static int expireIfNeeded(redisDb *db, robj *key);
566 static int deleteIfVolatile(redisDb *db, robj *key);
567 static int deleteIfSwapped(redisDb *db, robj *key);
568 static int deleteKey(redisDb *db, robj *key);
569 static time_t getExpire(redisDb *db, robj *key);
570 static int setExpire(redisDb *db, robj *key, time_t when);
571 static void updateSlavesWaitingBgsave(int bgsaveerr);
572 static void freeMemoryIfNeeded(void);
573 static int processCommand(redisClient *c);
574 static void setupSigSegvAction(void);
575 static void rdbRemoveTempFile(pid_t childpid);
576 static void aofRemoveTempFile(pid_t childpid);
577 static size_t stringObjectLen(robj *o);
578 static void processInputBuffer(redisClient *c);
579 static zskiplist *zslCreate(void);
580 static void zslFree(zskiplist *zsl);
581 static void zslInsert(zskiplist *zsl, double score, robj *obj);
582 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void initClientMultiState(redisClient *c);
584 static void freeClientMultiState(redisClient *c);
585 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
586 static void unblockClientWaitingData(redisClient *c);
587 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
588 static void vmInit(void);
589 static void vmMarkPagesFree(off_t page, off_t count);
590 static robj *vmLoadObject(robj *key);
591 static robj *vmPreviewObject(robj *key);
592 static int vmSwapOneObjectBlocking(void);
593 static int vmSwapOneObjectThreaded(void);
594 static int vmCanSwapOut(void);
595 static int tryFreeOneObjectFromFreelist(void);
596 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
597 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
598 static void vmCancelThreadedIOJob(robj *o);
599 static void lockThreadedIO(void);
600 static void unlockThreadedIO(void);
601 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
602 static void freeIOJob(iojob *j);
603 static void queueIOJob(iojob *j);
604 static int vmWriteObjectOnSwap(robj *o, off_t page);
605 static robj *vmReadObjectFromSwap(off_t page, int type);
606 static void waitEmptyIOJobsQueue(void);
607 static void vmReopenSwapFile(void);
608 static int vmFreePage(off_t page);
609 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
610 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
611 static int dontWaitForSwappedKey(redisClient *c, robj *key);
612 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
613 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
614 static struct redisCommand *lookupCommand(char *name);
615 static void call(redisClient *c, struct redisCommand *cmd);
616 static void resetClient(redisClient *c);
617 static void convertToRealHash(robj *o);
618 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
619 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
620 static void freePubsubPattern(void *p);
621 static int listMatchPubsubPattern(void *a, void *b);
622 static int compareStringObjects(robj *a, robj *b);
623 static void usage();
624 static int rewriteAppendOnlyFileBackground(void);
625
626 static void authCommand(redisClient *c);
627 static void pingCommand(redisClient *c);
628 static void echoCommand(redisClient *c);
629 static void setCommand(redisClient *c);
630 static void setnxCommand(redisClient *c);
631 static void setexCommand(redisClient *c);
632 static void getCommand(redisClient *c);
633 static void delCommand(redisClient *c);
634 static void existsCommand(redisClient *c);
635 static void incrCommand(redisClient *c);
636 static void decrCommand(redisClient *c);
637 static void incrbyCommand(redisClient *c);
638 static void decrbyCommand(redisClient *c);
639 static void selectCommand(redisClient *c);
640 static void randomkeyCommand(redisClient *c);
641 static void keysCommand(redisClient *c);
642 static void dbsizeCommand(redisClient *c);
643 static void lastsaveCommand(redisClient *c);
644 static void saveCommand(redisClient *c);
645 static void bgsaveCommand(redisClient *c);
646 static void bgrewriteaofCommand(redisClient *c);
647 static void shutdownCommand(redisClient *c);
648 static void moveCommand(redisClient *c);
649 static void renameCommand(redisClient *c);
650 static void renamenxCommand(redisClient *c);
651 static void lpushCommand(redisClient *c);
652 static void rpushCommand(redisClient *c);
653 static void lpopCommand(redisClient *c);
654 static void rpopCommand(redisClient *c);
655 static void llenCommand(redisClient *c);
656 static void lindexCommand(redisClient *c);
657 static void lrangeCommand(redisClient *c);
658 static void ltrimCommand(redisClient *c);
659 static void typeCommand(redisClient *c);
660 static void lsetCommand(redisClient *c);
661 static void saddCommand(redisClient *c);
662 static void sremCommand(redisClient *c);
663 static void smoveCommand(redisClient *c);
664 static void sismemberCommand(redisClient *c);
665 static void scardCommand(redisClient *c);
666 static void spopCommand(redisClient *c);
667 static void srandmemberCommand(redisClient *c);
668 static void sinterCommand(redisClient *c);
669 static void sinterstoreCommand(redisClient *c);
670 static void sunionCommand(redisClient *c);
671 static void sunionstoreCommand(redisClient *c);
672 static void sdiffCommand(redisClient *c);
673 static void sdiffstoreCommand(redisClient *c);
674 static void syncCommand(redisClient *c);
675 static void flushdbCommand(redisClient *c);
676 static void flushallCommand(redisClient *c);
677 static void sortCommand(redisClient *c);
678 static void lremCommand(redisClient *c);
679 static void rpoplpushcommand(redisClient *c);
680 static void infoCommand(redisClient *c);
681 static void mgetCommand(redisClient *c);
682 static void monitorCommand(redisClient *c);
683 static void expireCommand(redisClient *c);
684 static void expireatCommand(redisClient *c);
685 static void getsetCommand(redisClient *c);
686 static void ttlCommand(redisClient *c);
687 static void slaveofCommand(redisClient *c);
688 static void debugCommand(redisClient *c);
689 static void msetCommand(redisClient *c);
690 static void msetnxCommand(redisClient *c);
691 static void zaddCommand(redisClient *c);
692 static void zincrbyCommand(redisClient *c);
693 static void zrangeCommand(redisClient *c);
694 static void zrangebyscoreCommand(redisClient *c);
695 static void zcountCommand(redisClient *c);
696 static void zrevrangeCommand(redisClient *c);
697 static void zcardCommand(redisClient *c);
698 static void zremCommand(redisClient *c);
699 static void zscoreCommand(redisClient *c);
700 static void zremrangebyscoreCommand(redisClient *c);
701 static void multiCommand(redisClient *c);
702 static void execCommand(redisClient *c);
703 static void discardCommand(redisClient *c);
704 static void blpopCommand(redisClient *c);
705 static void brpopCommand(redisClient *c);
706 static void appendCommand(redisClient *c);
707 static void substrCommand(redisClient *c);
708 static void zrankCommand(redisClient *c);
709 static void zrevrankCommand(redisClient *c);
710 static void hsetCommand(redisClient *c);
711 static void hsetnxCommand(redisClient *c);
712 static void hgetCommand(redisClient *c);
713 static void hmsetCommand(redisClient *c);
714 static void hmgetCommand(redisClient *c);
715 static void hdelCommand(redisClient *c);
716 static void hlenCommand(redisClient *c);
717 static void zremrangebyrankCommand(redisClient *c);
718 static void zunionCommand(redisClient *c);
719 static void zinterCommand(redisClient *c);
720 static void hkeysCommand(redisClient *c);
721 static void hvalsCommand(redisClient *c);
722 static void hgetallCommand(redisClient *c);
723 static void hexistsCommand(redisClient *c);
724 static void configCommand(redisClient *c);
725 static void hincrbyCommand(redisClient *c);
726 static void subscribeCommand(redisClient *c);
727 static void unsubscribeCommand(redisClient *c);
728 static void psubscribeCommand(redisClient *c);
729 static void punsubscribeCommand(redisClient *c);
730 static void publishCommand(redisClient *c);
731
732 /*================================= Globals ================================= */
733
734 /* Global vars */
735 static struct redisServer server; /* server global state */
736 static struct redisCommand cmdTable[] = {
737 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
740 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
741 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
744 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
748 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
760 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
761 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
764 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
765 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
769 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
770 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
771 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
772 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
781 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
782 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
789 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
802 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
805 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
806 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
807 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
819 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
827 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
838 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
843 {NULL,NULL,0,0,NULL,0,0,0}
844 };
845
846 /*============================ Utility functions ============================ */
847
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character of the class; [^abc] negates the class,
 *            [a-z] is a range and \x inside a class matches x literally
 *   \x       matches x literally
 *
 * An unterminated class ("[abc") is treated as if it were closed at the end
 * of the pattern.
 *
 * pattern/patternLen: the glob and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             if non-zero, letters compare case-insensitively
 *                     (escaped characters inside a class are always compared
 *                     exactly).
 *
 * Returns 1 on match, 0 otherwise. Neither buffer is read past its stated
 * length. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*'. The length is checked before peeking at
             * the next pattern byte. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A class always consumes one character: an empty subject can
             * never match, not even a negated class. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ below lands exactly on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    /* Plain class member. A trailing lone backslash also
                     * ends up here and is compared literally. */
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one character of the subject. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
970
/* Glob-style match of two NUL terminated C strings: measures both arguments
 * with strlen() and delegates to stringmatchlen(). Returns what
 * stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
974
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",NULL) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' sign and decimal digits followed by an
 * optional, case insensitive unit:
 *
 *   (none), b      1
 *   k              1000            kb    1024
 *   m              1000*1000       mb    1024*1024
 *   g              1000*1000*1000  gb    1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on a parsing error and to 0
 * otherwise. Parsing errors are:
 *   - an unknown unit: the numeric part is still returned, unscaled;
 *   - a numeric part that does not fit the internal buffer: LLONG_MAX is
 *     returned;
 *   - a result that does not fit in a long long: LLONG_MAX or LLONG_MIN is
 *     returned depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * passing a negative char value to isdigit() is undefined. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the sign and digits into a NUL terminated buffer for strtoll(). */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* Saturate instead of overflowing: signed overflow is undefined. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1021
1022 static void redisLog(int level, const char *fmt, ...) {
1023 va_list ap;
1024 FILE *fp;
1025
1026 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1027 if (!fp) return;
1028
1029 va_start(ap, fmt);
1030 if (level >= server.verbosity) {
1031 char *c = ".-*#";
1032 char buf[64];
1033 time_t now;
1034
1035 now = time(NULL);
1036 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1037 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1038 vfprintf(fp, fmt, ap);
1039 fprintf(fp,"\n");
1040 fflush(fp);
1041 }
1042 va_end(ap);
1043
1044 if (server.logfile) fclose(fp);
1045 }
1046
1047 /*====================== Hash table type implementation ==================== */
1048
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1052
/* Value destructor for dict values that are plain zmalloc-style allocations:
 * the value is handed to zfree(). privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1058
1059 static void dictListDestructor(void *privdata, void *val)
1060 {
1061 DICT_NOTUSED(privdata);
1062 listRelease((list*)val);
1063 }
1064
1065 static int sdsDictKeyCompare(void *privdata, const void *key1,
1066 const void *key2)
1067 {
1068 int l1,l2;
1069 DICT_NOTUSED(privdata);
1070
1071 l1 = sdslen((sds)key1);
1072 l2 = sdslen((sds)key2);
1073 if (l1 != l2) return 0;
1074 return memcmp(key1, key2, l1) == 0;
1075 }
1076
/* Destructor for dict keys/values that are redis objects: drops one
 * reference with decrRefCount(). NULL is accepted and ignored, since the
 * values of swapped out keys are set to NULL. privdata is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1084
1085 static int dictObjKeyCompare(void *privdata, const void *key1,
1086 const void *key2)
1087 {
1088 const robj *o1 = key1, *o2 = key2;
1089 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1090 }
1091
1092 static unsigned int dictObjHash(const void *key) {
1093 const robj *o = key;
1094 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1095 }
1096
1097 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1098 const void *key2)
1099 {
1100 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1101 int cmp;
1102
1103 if (o1->encoding == REDIS_ENCODING_INT &&
1104 o2->encoding == REDIS_ENCODING_INT &&
1105 o1->ptr == o2->ptr) return 1;
1106
1107 o1 = getDecodedObject(o1);
1108 o2 = getDecodedObject(o2);
1109 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1110 decrRefCount(o1);
1111 decrRefCount(o2);
1112 return cmp;
1113 }
1114
1115 static unsigned int dictEncObjHash(const void *key) {
1116 robj *o = (robj*) key;
1117
1118 if (o->encoding == REDIS_ENCODING_RAW) {
1119 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1120 } else {
1121 if (o->encoding == REDIS_ENCODING_INT) {
1122 char buf[32];
1123 int len;
1124
1125 len = snprintf(buf,32,"%ld",(long)o->ptr);
1126 return dictGenHashFunction((unsigned char*)buf, len);
1127 } else {
1128 unsigned int hash;
1129
1130 o = getDecodedObject(o);
1131 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1132 decrRefCount(o);
1133 return hash;
1134 }
1135 }
1136 }
1137
1138 /* Sets type and expires */
1139 static dictType setDictType = {
1140 dictEncObjHash, /* hash function */
1141 NULL, /* key dup */
1142 NULL, /* val dup */
1143 dictEncObjKeyCompare, /* key compare */
1144 dictRedisObjectDestructor, /* key destructor */
1145 NULL /* val destructor */
1146 };
1147
1148 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1149 static dictType zsetDictType = {
1150 dictEncObjHash, /* hash function */
1151 NULL, /* key dup */
1152 NULL, /* val dup */
1153 dictEncObjKeyCompare, /* key compare */
1154 dictRedisObjectDestructor, /* key destructor */
1155 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1156 };
1157
1158 /* Db->dict */
1159 static dictType dbDictType = {
1160 dictObjHash, /* hash function */
1161 NULL, /* key dup */
1162 NULL, /* val dup */
1163 dictObjKeyCompare, /* key compare */
1164 dictRedisObjectDestructor, /* key destructor */
1165 dictRedisObjectDestructor /* val destructor */
1166 };
1167
1168 /* Db->expires */
1169 static dictType keyptrDictType = {
1170 dictObjHash, /* hash function */
1171 NULL, /* key dup */
1172 NULL, /* val dup */
1173 dictObjKeyCompare, /* key compare */
1174 dictRedisObjectDestructor, /* key destructor */
1175 NULL /* val destructor */
1176 };
1177
1178 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1179 static dictType hashDictType = {
1180 dictEncObjHash, /* hash function */
1181 NULL, /* key dup */
1182 NULL, /* val dup */
1183 dictEncObjKeyCompare, /* key compare */
1184 dictRedisObjectDestructor, /* key destructor */
1185 dictRedisObjectDestructor /* val destructor */
1186 };
1187
1188 /* Keylist hash table type has unencoded redis objects as keys and
1189 * lists as values. It's used for blocking operations (BLPOP) and to
1190 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1191 static dictType keylistDictType = {
1192 dictObjHash, /* hash function */
1193 NULL, /* key dup */
1194 NULL, /* val dup */
1195 dictObjKeyCompare, /* key compare */
1196 dictRedisObjectDestructor, /* key destructor */
1197 dictListDestructor /* val destructor */
1198 };
1199
1200 static void version();
1201
1202 /* ========================= Random utility functions ======================= */
1203
1204 /* Redis generally does not try to recover from out of memory conditions
1205 * when allocating objects or strings, it is not clear if it will be possible
1206 * to report this condition to the client since the networking layer itself
1207 * is based on heap allocation for send buffers, so we simply abort.
1208 * At least the code will be simpler to read... */
1209 static void oom(const char *msg) {
1210 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1211 sleep(1);
1212 abort();
1213 }
1214
1215 /* ====================== Redis server networking stuff ===================== */
1216 static void closeTimedoutClients(void) {
1217 redisClient *c;
1218 listNode *ln;
1219 time_t now = time(NULL);
1220 listIter li;
1221
1222 listRewind(server.clients,&li);
1223 while ((ln = listNext(&li)) != NULL) {
1224 c = listNodeValue(ln);
1225 if (server.maxidletime &&
1226 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1227 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1228 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1229 listLength(c->pubsub_patterns) == 0 &&
1230 (now - c->lastinteraction > server.maxidletime))
1231 {
1232 redisLog(REDIS_VERBOSE,"Closing idle client");
1233 freeClient(c);
1234 } else if (c->flags & REDIS_BLOCKED) {
1235 if (c->blockingto != 0 && c->blockingto < now) {
1236 addReply(c,shared.nullmultibulk);
1237 unblockClientWaitingData(c);
1238 }
1239 }
1240 }
1241 }
1242
1243 static int htNeedsResize(dict *dict) {
1244 long long size, used;
1245
1246 size = dictSlots(dict);
1247 used = dictSize(dict);
1248 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1249 (used*100/size < REDIS_HT_MINFILL));
1250 }
1251
1252 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1253 * we resize the hash table to save memory */
1254 static void tryResizeHashTables(void) {
1255 int j;
1256
1257 for (j = 0; j < server.dbnum; j++) {
1258 if (htNeedsResize(server.db[j].dict))
1259 dictResize(server.db[j].dict);
1260 if (htNeedsResize(server.db[j].expires))
1261 dictResize(server.db[j].expires);
1262 }
1263 }
1264
1265 /* Our hash table implementation performs rehashing incrementally while
1266 * we write/read from the hash table. Still if the server is idle, the hash
1267 * table will use two tables for a long time. So we try to use 1 millisecond
1268 * of CPU time at every serverCron() loop in order to rehash some key. */
1269 static void incrementallyRehash(void) {
1270 int j;
1271
1272 for (j = 0; j < server.dbnum; j++) {
1273 if (dictIsRehashing(server.db[j].dict)) {
1274 dictRehashMilliseconds(server.db[j].dict,1);
1275 break; /* already used our millisecond for this loop... */
1276 }
1277 }
1278 }
1279
1280 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1281 void backgroundSaveDoneHandler(int statloc) {
1282 int exitcode = WEXITSTATUS(statloc);
1283 int bysignal = WIFSIGNALED(statloc);
1284
1285 if (!bysignal && exitcode == 0) {
1286 redisLog(REDIS_NOTICE,
1287 "Background saving terminated with success");
1288 server.dirty = 0;
1289 server.lastsave = time(NULL);
1290 } else if (!bysignal && exitcode != 0) {
1291 redisLog(REDIS_WARNING, "Background saving error");
1292 } else {
1293 redisLog(REDIS_WARNING,
1294 "Background saving terminated by signal %d", WTERMSIG(statloc));
1295 rdbRemoveTempFile(server.bgsavechildpid);
1296 }
1297 server.bgsavechildpid = -1;
1298 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1299 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1300 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1301 }
1302
1303 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1304 * Handle this. */
1305 void backgroundRewriteDoneHandler(int statloc) {
1306 int exitcode = WEXITSTATUS(statloc);
1307 int bysignal = WIFSIGNALED(statloc);
1308
1309 if (!bysignal && exitcode == 0) {
1310 int fd;
1311 char tmpfile[256];
1312
1313 redisLog(REDIS_NOTICE,
1314 "Background append only file rewriting terminated with success");
1315 /* Now it's time to flush the differences accumulated by the parent */
1316 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1317 fd = open(tmpfile,O_WRONLY|O_APPEND);
1318 if (fd == -1) {
1319 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1320 goto cleanup;
1321 }
1322 /* Flush our data... */
1323 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1324 (signed) sdslen(server.bgrewritebuf)) {
1325 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1326 close(fd);
1327 goto cleanup;
1328 }
1329 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1330 /* Now our work is to rename the temp file into the stable file. And
1331 * switch the file descriptor used by the server for append only. */
1332 if (rename(tmpfile,server.appendfilename) == -1) {
1333 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1334 close(fd);
1335 goto cleanup;
1336 }
1337 /* Mission completed... almost */
1338 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1339 if (server.appendfd != -1) {
1340 /* If append only is actually enabled... */
1341 close(server.appendfd);
1342 server.appendfd = fd;
1343 fsync(fd);
1344 server.appendseldb = -1; /* Make sure it will issue SELECT */
1345 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1346 } else {
1347 /* If append only is disabled we just generate a dump in this
1348 * format. Why not? */
1349 close(fd);
1350 }
1351 } else if (!bysignal && exitcode != 0) {
1352 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1353 } else {
1354 redisLog(REDIS_WARNING,
1355 "Background append only file rewriting terminated by signal %d",
1356 WTERMSIG(statloc));
1357 }
1358 cleanup:
1359 sdsfree(server.bgrewritebuf);
1360 server.bgrewritebuf = sdsempty();
1361 aofRemoveTempFile(server.bgrewritechildpid);
1362 server.bgrewritechildpid = -1;
1363 }
1364
1365 /* This function is called once a background process of some kind terminates,
1366 * as we want to avoid resizing the hash tables when there is a child in order
1367 * to play well with copy-on-write (otherwise when a resize happens lots of
1368 * memory pages are copied). The goal of this function is to update the ability
1369 * for dict.c to resize the hash tables accordingly to the fact we have o not
1370 * running childs. */
1371 static void updateDictResizePolicy(void) {
1372 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1373 dictEnableResize();
1374 else
1375 dictDisableResize();
1376 }
1377
1378 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1379 int j, loops = server.cronloops++;
1380 REDIS_NOTUSED(eventLoop);
1381 REDIS_NOTUSED(id);
1382 REDIS_NOTUSED(clientData);
1383
1384 /* We take a cached value of the unix time in the global state because
1385 * with virtual memory and aging there is to store the current time
1386 * in objects at every object access, and accuracy is not needed.
1387 * To access a global var is faster than calling time(NULL) */
1388 server.unixtime = time(NULL);
1389
1390 /* Show some info about non-empty databases */
1391 for (j = 0; j < server.dbnum; j++) {
1392 long long size, used, vkeys;
1393
1394 size = dictSlots(server.db[j].dict);
1395 used = dictSize(server.db[j].dict);
1396 vkeys = dictSize(server.db[j].expires);
1397 if (!(loops % 50) && (used || vkeys)) {
1398 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1399 /* dictPrintStats(server.dict); */
1400 }
1401 }
1402
1403 /* We don't want to resize the hash tables while a bacground saving
1404 * is in progress: the saving child is created using fork() that is
1405 * implemented with a copy-on-write semantic in most modern systems, so
1406 * if we resize the HT while there is the saving child at work actually
1407 * a lot of memory movements in the parent will cause a lot of pages
1408 * copied. */
1409 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1410 if (!(loops % 10)) tryResizeHashTables();
1411 if (server.activerehashing) incrementallyRehash();
1412 }
1413
1414 /* Show information about connected clients */
1415 if (!(loops % 50)) {
1416 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1417 listLength(server.clients)-listLength(server.slaves),
1418 listLength(server.slaves),
1419 zmalloc_used_memory());
1420 }
1421
1422 /* Close connections of timedout clients */
1423 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1424 closeTimedoutClients();
1425
1426 /* Check if a background saving or AOF rewrite in progress terminated */
1427 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1428 int statloc;
1429 pid_t pid;
1430
1431 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1432 if (pid == server.bgsavechildpid) {
1433 backgroundSaveDoneHandler(statloc);
1434 } else {
1435 backgroundRewriteDoneHandler(statloc);
1436 }
1437 updateDictResizePolicy();
1438 }
1439 } else {
1440 /* If there is not a background saving in progress check if
1441 * we have to save now */
1442 time_t now = time(NULL);
1443 for (j = 0; j < server.saveparamslen; j++) {
1444 struct saveparam *sp = server.saveparams+j;
1445
1446 if (server.dirty >= sp->changes &&
1447 now-server.lastsave > sp->seconds) {
1448 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1449 sp->changes, sp->seconds);
1450 rdbSaveBackground(server.dbfilename);
1451 break;
1452 }
1453 }
1454 }
1455
1456 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1457 * will use few CPU cycles if there are few expiring keys, otherwise
1458 * it will get more aggressive to avoid that too much memory is used by
1459 * keys that can be removed from the keyspace. */
1460 for (j = 0; j < server.dbnum; j++) {
1461 int expired;
1462 redisDb *db = server.db+j;
1463
1464 /* Continue to expire if at the end of the cycle more than 25%
1465 * of the keys were expired. */
1466 do {
1467 long num = dictSize(db->expires);
1468 time_t now = time(NULL);
1469
1470 expired = 0;
1471 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1472 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1473 while (num--) {
1474 dictEntry *de;
1475 time_t t;
1476
1477 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1478 t = (time_t) dictGetEntryVal(de);
1479 if (now > t) {
1480 deleteKey(db,dictGetEntryKey(de));
1481 expired++;
1482 server.stat_expiredkeys++;
1483 }
1484 }
1485 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1486 }
1487
1488 /* Swap a few keys on disk if we are over the memory limit and VM
1489 * is enbled. Try to free objects from the free list first. */
1490 if (vmCanSwapOut()) {
1491 while (server.vm_enabled && zmalloc_used_memory() >
1492 server.vm_max_memory)
1493 {
1494 int retval;
1495
1496 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1497 retval = (server.vm_max_threads == 0) ?
1498 vmSwapOneObjectBlocking() :
1499 vmSwapOneObjectThreaded();
1500 if (retval == REDIS_ERR && !(loops % 300) &&
1501 zmalloc_used_memory() >
1502 (server.vm_max_memory+server.vm_max_memory/10))
1503 {
1504 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1505 }
1506 /* Note that when using threade I/O we free just one object,
1507 * because anyway when the I/O thread in charge to swap this
1508 * object out will finish, the handler of completed jobs
1509 * will try to swap more objects if we are still out of memory. */
1510 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1511 }
1512 }
1513
1514 /* Check if we should connect to a MASTER */
1515 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1516 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1517 if (syncWithMaster() == REDIS_OK) {
1518 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1519 if (server.appendonly) rewriteAppendOnlyFileBackground();
1520 }
1521 }
1522 return 100;
1523 }
1524
1525 /* This function gets called every time Redis is entering the
1526 * main loop of the event driven library, that is, before to sleep
1527 * for ready file descriptors. */
1528 static void beforeSleep(struct aeEventLoop *eventLoop) {
1529 REDIS_NOTUSED(eventLoop);
1530
1531 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1532 listIter li;
1533 listNode *ln;
1534
1535 listRewind(server.io_ready_clients,&li);
1536 while((ln = listNext(&li))) {
1537 redisClient *c = ln->value;
1538 struct redisCommand *cmd;
1539
1540 /* Resume the client. */
1541 listDelNode(server.io_ready_clients,ln);
1542 c->flags &= (~REDIS_IO_WAIT);
1543 server.vm_blocked_clients--;
1544 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1545 readQueryFromClient, c);
1546 cmd = lookupCommand(c->argv[0]->ptr);
1547 assert(cmd != NULL);
1548 call(c,cmd);
1549 resetClient(c);
1550 /* There may be more data to process in the input buffer. */
1551 if (c->querybuf && sdslen(c->querybuf) > 0)
1552 processInputBuffer(c);
1553 }
1554 }
1555 }
1556
1557 static void createSharedObjects(void) {
1558 int j;
1559
1560 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1561 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1562 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1563 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1564 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1565 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1566 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1567 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1568 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1569 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1570 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1571 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1572 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1573 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1574 "-ERR no such key\r\n"));
1575 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1576 "-ERR syntax error\r\n"));
1577 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1578 "-ERR source and destination objects are the same\r\n"));
1579 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1580 "-ERR index out of range\r\n"));
1581 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1582 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1583 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1584 shared.select0 = createStringObject("select 0\r\n",10);
1585 shared.select1 = createStringObject("select 1\r\n",10);
1586 shared.select2 = createStringObject("select 2\r\n",10);
1587 shared.select3 = createStringObject("select 3\r\n",10);
1588 shared.select4 = createStringObject("select 4\r\n",10);
1589 shared.select5 = createStringObject("select 5\r\n",10);
1590 shared.select6 = createStringObject("select 6\r\n",10);
1591 shared.select7 = createStringObject("select 7\r\n",10);
1592 shared.select8 = createStringObject("select 8\r\n",10);
1593 shared.select9 = createStringObject("select 9\r\n",10);
1594 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1595 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
1596 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1597 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1598 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1599 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1600 shared.mbulk3 = createStringObject("*3\r\n",4);
1601 shared.mbulk4 = createStringObject("*4\r\n",4);
1602 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1603 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1604 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1605 }
1606 }
1607
1608 static void appendServerSaveParams(time_t seconds, int changes) {
1609 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1610 server.saveparams[server.saveparamslen].seconds = seconds;
1611 server.saveparams[server.saveparamslen].changes = changes;
1612 server.saveparamslen++;
1613 }
1614
1615 static void resetServerSaveParams() {
1616 zfree(server.saveparams);
1617 server.saveparams = NULL;
1618 server.saveparamslen = 0;
1619 }
1620
1621 static void initServerConfig() {
1622 server.dbnum = REDIS_DEFAULT_DBNUM;
1623 server.port = REDIS_SERVERPORT;
1624 server.verbosity = REDIS_VERBOSE;
1625 server.maxidletime = REDIS_MAXIDLETIME;
1626 server.saveparams = NULL;
1627 server.logfile = NULL; /* NULL = log on standard output */
1628 server.bindaddr = NULL;
1629 server.glueoutputbuf = 1;
1630 server.daemonize = 0;
1631 server.appendonly = 0;
1632 server.appendfsync = APPENDFSYNC_ALWAYS;
1633 server.lastfsync = time(NULL);
1634 server.appendfd = -1;
1635 server.appendseldb = -1; /* Make sure the first time will not match */
1636 server.pidfile = zstrdup("/var/run/redis.pid");
1637 server.dbfilename = zstrdup("dump.rdb");
1638 server.appendfilename = zstrdup("appendonly.aof");
1639 server.requirepass = NULL;
1640 server.rdbcompression = 1;
1641 server.activerehashing = 1;
1642 server.maxclients = 0;
1643 server.blpop_blocked_clients = 0;
1644 server.maxmemory = 0;
1645 server.vm_enabled = 0;
1646 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1647 server.vm_page_size = 256; /* 256 bytes per page */
1648 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1649 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1650 server.vm_max_threads = 4;
1651 server.vm_blocked_clients = 0;
1652 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1653 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1654
1655 resetServerSaveParams();
1656
1657 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1658 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1659 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1660 /* Replication related */
1661 server.isslave = 0;
1662 server.masterauth = NULL;
1663 server.masterhost = NULL;
1664 server.masterport = 6379;
1665 server.master = NULL;
1666 server.replstate = REDIS_REPL_NONE;
1667
1668 /* Double constants initialization */
1669 R_Zero = 0.0;
1670 R_PosInf = 1.0/R_Zero;
1671 R_NegInf = -1.0/R_Zero;
1672 R_Nan = R_Zero/R_Zero;
1673 }
1674
1675 static void initServer() {
1676 int j;
1677
1678 signal(SIGHUP, SIG_IGN);
1679 signal(SIGPIPE, SIG_IGN);
1680 setupSigSegvAction();
1681
1682 server.devnull = fopen("/dev/null","w");
1683 if (server.devnull == NULL) {
1684 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1685 exit(1);
1686 }
1687 server.clients = listCreate();
1688 server.slaves = listCreate();
1689 server.monitors = listCreate();
1690 server.objfreelist = listCreate();
1691 createSharedObjects();
1692 server.el = aeCreateEventLoop();
1693 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1694 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1695 if (server.fd == -1) {
1696 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1697 exit(1);
1698 }
1699 for (j = 0; j < server.dbnum; j++) {
1700 server.db[j].dict = dictCreate(&dbDictType,NULL);
1701 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1702 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1703 if (server.vm_enabled)
1704 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1705 server.db[j].id = j;
1706 }
1707 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1708 server.pubsub_patterns = listCreate();
1709 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1710 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1711 server.cronloops = 0;
1712 server.bgsavechildpid = -1;
1713 server.bgrewritechildpid = -1;
1714 server.bgrewritebuf = sdsempty();
1715 server.lastsave = time(NULL);
1716 server.dirty = 0;
1717 server.stat_numcommands = 0;
1718 server.stat_numconnections = 0;
1719 server.stat_expiredkeys = 0;
1720 server.stat_starttime = time(NULL);
1721 server.unixtime = time(NULL);
1722 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1723 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1724 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1725
1726 if (server.appendonly) {
1727 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1728 if (server.appendfd == -1) {
1729 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1730 strerror(errno));
1731 exit(1);
1732 }
1733 }
1734
1735 if (server.vm_enabled) vmInit();
1736 }
1737
1738 /* Empty the whole database */
1739 static long long emptyDb() {
1740 int j;
1741 long long removed = 0;
1742
1743 for (j = 0; j < server.dbnum; j++) {
1744 removed += dictSize(server.db[j].dict);
1745 dictEmpty(server.db[j].dict);
1746 dictEmpty(server.db[j].expires);
1747 }
1748 return removed;
1749 }
1750
/* Map the case-insensitive strings "yes" / "no" to 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1756
1757 /* I agree, this is a very rudimental way to load a configuration...
1758 will improve later if the config gets more complex */
1759 static void loadServerConfig(char *filename) {
1760 FILE *fp;
1761 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1762 int linenum = 0;
1763 sds line = NULL;
1764
1765 if (filename[0] == '-' && filename[1] == '\0')
1766 fp = stdin;
1767 else {
1768 if ((fp = fopen(filename,"r")) == NULL) {
1769 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1770 exit(1);
1771 }
1772 }
1773
1774 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1775 sds *argv;
1776 int argc, j;
1777
1778 linenum++;
1779 line = sdsnew(buf);
1780 line = sdstrim(line," \t\r\n");
1781
1782 /* Skip comments and blank lines*/
1783 if (line[0] == '#' || line[0] == '\0') {
1784 sdsfree(line);
1785 continue;
1786 }
1787
1788 /* Split into arguments */
1789 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1790 sdstolower(argv[0]);
1791
1792 /* Execute config directives */
1793 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1794 server.maxidletime = atoi(argv[1]);
1795 if (server.maxidletime < 0) {
1796 err = "Invalid timeout value"; goto loaderr;
1797 }
1798 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1799 server.port = atoi(argv[1]);
1800 if (server.port < 1 || server.port > 65535) {
1801 err = "Invalid port"; goto loaderr;
1802 }
1803 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1804 server.bindaddr = zstrdup(argv[1]);
1805 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1806 int seconds = atoi(argv[1]);
1807 int changes = atoi(argv[2]);
1808 if (seconds < 1 || changes < 0) {
1809 err = "Invalid save parameters"; goto loaderr;
1810 }
1811 appendServerSaveParams(seconds,changes);
1812 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1813 if (chdir(argv[1]) == -1) {
1814 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1815 argv[1], strerror(errno));
1816 exit(1);
1817 }
1818 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1819 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1820 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1821 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1822 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1823 else {
1824 err = "Invalid log level. Must be one of debug, notice, warning";
1825 goto loaderr;
1826 }
1827 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1828 FILE *logfp;
1829
1830 server.logfile = zstrdup(argv[1]);
1831 if (!strcasecmp(server.logfile,"stdout")) {
1832 zfree(server.logfile);
1833 server.logfile = NULL;
1834 }
1835 if (server.logfile) {
1836 /* Test if we are able to open the file. The server will not
1837 * be able to abort just for this problem later... */
1838 logfp = fopen(server.logfile,"a");
1839 if (logfp == NULL) {
1840 err = sdscatprintf(sdsempty(),
1841 "Can't open the log file: %s", strerror(errno));
1842 goto loaderr;
1843 }
1844 fclose(logfp);
1845 }
1846 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1847 server.dbnum = atoi(argv[1]);
1848 if (server.dbnum < 1) {
1849 err = "Invalid number of databases"; goto loaderr;
1850 }
1851 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1852 loadServerConfig(argv[1]);
1853 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1854 server.maxclients = atoi(argv[1]);
1855 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1856 server.maxmemory = memtoll(argv[1],NULL);
1857 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1858 server.masterhost = sdsnew(argv[1]);
1859 server.masterport = atoi(argv[2]);
1860 server.replstate = REDIS_REPL_CONNECT;
1861 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1862 server.masterauth = zstrdup(argv[1]);
1863 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1864 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1865 err = "argument must be 'yes' or 'no'"; goto loaderr;
1866 }
1867 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1868 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1869 err = "argument must be 'yes' or 'no'"; goto loaderr;
1870 }
1871 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1872 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1873 err = "argument must be 'yes' or 'no'"; goto loaderr;
1874 }
1875 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1876 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1877 err = "argument must be 'yes' or 'no'"; goto loaderr;
1878 }
1879 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1880 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1881 err = "argument must be 'yes' or 'no'"; goto loaderr;
1882 }
1883 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1884 if (!strcasecmp(argv[1],"no")) {
1885 server.appendfsync = APPENDFSYNC_NO;
1886 } else if (!strcasecmp(argv[1],"always")) {
1887 server.appendfsync = APPENDFSYNC_ALWAYS;
1888 } else if (!strcasecmp(argv[1],"everysec")) {
1889 server.appendfsync = APPENDFSYNC_EVERYSEC;
1890 } else {
1891 err = "argument must be 'no', 'always' or 'everysec'";
1892 goto loaderr;
1893 }
1894 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1895 server.requirepass = zstrdup(argv[1]);
1896 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1897 zfree(server.pidfile);
1898 server.pidfile = zstrdup(argv[1]);
1899 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1900 zfree(server.dbfilename);
1901 server.dbfilename = zstrdup(argv[1]);
1902 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1903 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1904 err = "argument must be 'yes' or 'no'"; goto loaderr;
1905 }
1906 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1907 zfree(server.vm_swap_file);
1908 server.vm_swap_file = zstrdup(argv[1]);
1909 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1910 server.vm_max_memory = memtoll(argv[1],NULL);
1911 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1912 server.vm_page_size = memtoll(argv[1], NULL);
1913 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1914 server.vm_pages = memtoll(argv[1], NULL);
1915 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1916 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1917 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1918 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1919 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1920 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1921 } else {
1922 err = "Bad directive or wrong number of arguments"; goto loaderr;
1923 }
1924 for (j = 0; j < argc; j++)
1925 sdsfree(argv[j]);
1926 zfree(argv);
1927 sdsfree(line);
1928 }
1929 if (fp != stdin) fclose(fp);
1930 return;
1931
1932 loaderr:
1933 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1934 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1935 fprintf(stderr, ">>> '%s'\n", line);
1936 fprintf(stderr, "%s\n", err);
1937 exit(1);
1938 }
1939
1940 static void freeClientArgv(redisClient *c) {
1941 int j;
1942
1943 for (j = 0; j < c->argc; j++)
1944 decrRefCount(c->argv[j]);
1945 for (j = 0; j < c->mbargc; j++)
1946 decrRefCount(c->mbargv[j]);
1947 c->argc = 0;
1948 c->mbargc = 0;
1949 }
1950
/* Destroy a client: release its query buffer, pub/sub subscriptions, file
 * events, reply list, argument vectors and socket, unlink it from every
 * server-side list that references it, and finally free the structure.
 * The order of the steps matters; see the comments below. */
1951 static void freeClient(redisClient *c) {
1952 listNode *ln;
1953
1954 /* Note that if the client we are freeing is blocked into a blocking
1955 * call, we have to set querybuf to NULL *before* calling
1956 * unblockClientWaitingData(), so that processInputBuffer() does not get
1957 * called. Also it is important to remove the file events after
1958 * this, because this call adds the READABLE event. */
1959 sdsfree(c->querybuf);
1960 c->querybuf = NULL;
1961 if (c->flags & REDIS_BLOCKED)
1962 unblockClientWaitingData(c);
1963
1964 /* Unsubscribe from all the pubsub channels */
1965 pubsubUnsubscribeAllChannels(c,0);
1966 pubsubUnsubscribeAllPatterns(c,0);
1967 dictRelease(c->pubsub_channels);
1968 listRelease(c->pubsub_patterns);
1969 /* Obvious cleanup */
1970 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1971 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1972 listRelease(c->reply);
1973 freeClientArgv(c);
1974 close(c->fd);
1975 /* Remove from the list of clients */
1976 ln = listSearchKey(server.clients,c);
1977 redisAssert(ln != NULL);
1978 listDelNode(server.clients,ln);
1979 /* Remove from the list of clients waiting for swapped keys */
/* A client flagged IO_WAIT with no pending io_keys may sit in the
 * io_ready_clients list; remove it from there if found. */
1980 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1981 ln = listSearchKey(server.io_ready_clients,c);
1982 if (ln) {
1983 listDelNode(server.io_ready_clients,ln);
1984 server.vm_blocked_clients--;
1985 }
1986 }
/* With VM enabled, stop waiting for every key still in c->io_keys.
 * NOTE(review): the loop only terminates if dontWaitForSwappedKey()
 * removes the key from c->io_keys -- confirm in its definition. */
1987 while (server.vm_enabled && listLength(c->io_keys)) {
1988 ln = listFirst(c->io_keys);
1989 dontWaitForSwappedKey(c,ln->value);
1990 }
1991 listRelease(c->io_keys);
1992 /* Master/slave cleanup */
1993 if (c->flags & REDIS_SLAVE) {
/* Close the dump file descriptor if a bulk transfer was in progress */
1994 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1995 close(c->repldbfd);
/* Clients that also carry REDIS_MONITOR are kept in server.monitors,
 * the others in server.slaves. */
1996 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1997 ln = listSearchKey(l,c);
1998 redisAssert(ln != NULL);
1999 listDelNode(l,ln);
2000 }
/* The freed client was our master: clear the reference and set the
 * replication state back to REDIS_REPL_CONNECT. */
2001 if (c->flags & REDIS_MASTER) {
2002 server.master = NULL;
2003 server.replstate = REDIS_REPL_CONNECT;
2004 }
2005 /* Release memory */
2006 zfree(c->argv);
2007 zfree(c->mbargv);
2008 freeClientMultiState(c);
2009 zfree(c);
2010 }
2011
2012 #define GLUEREPLY_UP_TO (1024)
2013 static void glueReplyBuffersIfNeeded(redisClient *c) {
2014 int copylen = 0;
2015 char buf[GLUEREPLY_UP_TO];
2016 listNode *ln;
2017 listIter li;
2018 robj *o;
2019
2020 listRewind(c->reply,&li);
2021 while((ln = listNext(&li))) {
2022 int objlen;
2023
2024 o = ln->value;
2025 objlen = sdslen(o->ptr);
2026 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2027 memcpy(buf+copylen,o->ptr,objlen);
2028 copylen += objlen;
2029 listDelNode(c->reply,ln);
2030 } else {
2031 if (copylen == 0) return;
2032 break;
2033 }
2034 }
2035 /* Now the output buffer is empty, add the new single element */
2036 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2037 listAddNodeHead(c->reply,o);
2038 }
2039
/* Writable-event handler: send as much of the client's pending reply list
 * as the socket accepts, up to REDIS_MAX_WRITE_PER_EVENT bytes per call.
 * c->sentlen tracks how many bytes of the head object were already sent.
 * When the list becomes empty the WRITABLE event is removed. On a write
 * error other than EAGAIN the client is freed. */
2040 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2041 redisClient *c = privdata;
2042 int nwritten = 0, totwritten = 0, objlen;
2043 robj *o;
2044 REDIS_NOTUSED(el);
2045 REDIS_NOTUSED(mask);
2046
2047 /* Use writev() if we have enough buffers to send */
2048 if (!server.glueoutputbuf &&
2049 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2050 !(c->flags & REDIS_MASTER))
2051 {
2052 sendReplyToClientWritev(el, fd, privdata, mask);
2053 return;
2054 }
2055
2056 while(listLength(c->reply)) {
/* Optionally coalesce several small reply objects into one buffer */
2057 if (server.glueoutputbuf && listLength(c->reply) > 1)
2058 glueReplyBuffersIfNeeded(c);
2059
2060 o = listNodeValue(listFirst(c->reply));
2061 objlen = sdslen(o->ptr);
2062
/* Empty objects carry nothing to send: just drop them */
2063 if (objlen == 0) {
2064 listDelNode(c->reply,listFirst(c->reply));
2065 continue;
2066 }
2067
2068 if (c->flags & REDIS_MASTER) {
2069 /* Don't reply to a master */
/* Pretend the remainder was written so the object gets discarded */
2070 nwritten = objlen - c->sentlen;
2071 } else {
2072 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
/* Stop on error (-1) or when nothing could be written (0); the
 * error case is examined after the loop. */
2073 if (nwritten <= 0) break;
2074 }
2075 c->sentlen += nwritten;
2076 totwritten += nwritten;
2077 /* If we fully sent the object on head go to the next one */
2078 if (c->sentlen == objlen) {
2079 listDelNode(c->reply,listFirst(c->reply));
2080 c->sentlen = 0;
2081 }
2082 /* Note that we avoid sending more than REDIS_MAX_WRITE_PER_EVENT
2083 * bytes, in a single threaded server it's a good idea to serve
2084 * other clients as well, even if a very large request comes from
2085 * super fast link that is always able to accept data (in real world
2086 * scenario think about 'KEYS *' against the loopback interface) */
2087 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2088 }
2089 if (nwritten == -1) {
2090 if (errno == EAGAIN) {
/* The socket is just full: not an error */
2091 nwritten = 0;
2092 } else {
2093 redisLog(REDIS_VERBOSE,
2094 "Error writing to client: %s", strerror(errno));
2095 freeClient(c);
2096 return;
2097 }
2098 }
2099 if (totwritten > 0) c->lastinteraction = time(NULL);
/* Nothing left to send: stop listening for writable events */
2100 if (listLength(c->reply) == 0) {
2101 c->sentlen = 0;
2102 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2103 }
2104 }
2105
2106 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2107 {
2108 redisClient *c = privdata;
2109 int nwritten = 0, totwritten = 0, objlen, willwrite;
2110 robj *o;
2111 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2112 int offset, ion = 0;
2113 REDIS_NOTUSED(el);
2114 REDIS_NOTUSED(mask);
2115
2116 listNode *node;
2117 while (listLength(c->reply)) {
2118 offset = c->sentlen;
2119 ion = 0;
2120 willwrite = 0;
2121
2122 /* fill-in the iov[] array */
2123 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2124 o = listNodeValue(node);
2125 objlen = sdslen(o->ptr);
2126
2127 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2128 break;
2129
2130 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2131 break; /* no more iovecs */
2132
2133 iov[ion].iov_base = ((char*)o->ptr) + offset;
2134 iov[ion].iov_len = objlen - offset;
2135 willwrite += objlen - offset;
2136 offset = 0; /* just for the first item */
2137 ion++;
2138 }
2139
2140 if(willwrite == 0)
2141 break;
2142
2143 /* write all collected blocks at once */
2144 if((nwritten = writev(fd, iov, ion)) < 0) {
2145 if (errno != EAGAIN) {
2146 redisLog(REDIS_VERBOSE,
2147 "Error writing to client: %s", strerror(errno));
2148 freeClient(c);
2149 return;
2150 }
2151 break;
2152 }
2153
2154 totwritten += nwritten;
2155 offset = c->sentlen;
2156
2157 /* remove written robjs from c->reply */
2158 while (nwritten && listLength(c->reply)) {
2159 o = listNodeValue(listFirst(c->reply));
2160 objlen = sdslen(o->ptr);
2161
2162 if(nwritten >= objlen - offset) {
2163 listDelNode(c->reply, listFirst(c->reply));
2164 nwritten -= objlen - offset;
2165 c->sentlen = 0;
2166 } else {
2167 /* partial write */
2168 c->sentlen += nwritten;
2169 break;
2170 }
2171 offset = 0;
2172 }
2173 }
2174
2175 if (totwritten > 0)
2176 c->lastinteraction = time(NULL);
2177
2178 if (listLength(c->reply) == 0) {
2179 c->sentlen = 0;
2180 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2181 }
2182 }
2183
2184 static struct redisCommand *lookupCommand(char *name) {
2185 int j = 0;
2186 while(cmdTable[j].name != NULL) {
2187 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2188 j++;
2189 }
2190 return NULL;
2191 }
2192
2193 /* resetClient prepare the client to process the next command */
2194 static void resetClient(redisClient *c) {
2195 freeClientArgv(c);
2196 c->bulklen = -1;
2197 c->multibulk = 0;
2198 }
2199
2200 /* Call() is the core of Redis execution of a command */
2201 static void call(redisClient *c, struct redisCommand *cmd) {
2202 long long dirty;
2203
2204 dirty = server.dirty;
2205 cmd->proc(c);
2206 dirty = server.dirty-dirty;
2207
2208 if (server.appendonly && dirty)
2209 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2210 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2211 listLength(server.slaves))
2212 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2213 if (listLength(server.monitors))
2214 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
2215 server.stat_numcommands++;
2216 }
2217
2218 /* If this function gets called we already read a whole
2219 * command, arguments are in the client argv/argc fields.
2220 * processCommand() executes the command or prepares the
2221 * server for a bulk read from the client.
2222 *
2223 * If 1 is returned the client is still alive and valid and
2224 * other operations can be performed by the caller. Otherwise
2225 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2226 static int processCommand(redisClient *c) {
2227 struct redisCommand *cmd;
2228
2229 /* Free some memory if needed (maxmemory setting) */
2230 if (server.maxmemory) freeMemoryIfNeeded();
2231
2232 /* Handle the multi bulk command type. This is an alternative protocol
2233 * supported by Redis in order to receive commands that are composed of
2234 * multiple binary-safe "bulk" arguments. The latency of processing is
2235 * a bit higher but this allows things like multi-sets, so if this
2236 * protocol is used only for MSET and similar commands this is a big win. */
/* Case 1: a "*<count>" line starts a new multi bulk request */
2237 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2238 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2239 if (c->multibulk <= 0) {
2240 resetClient(c);
2241 return 1;
2242 } else {
/* Discard the "*<count>" argument and wait for the bulk items */
2243 decrRefCount(c->argv[c->argc-1]);
2244 c->argc--;
2245 return 1;
2246 }
2247 } else if (c->multibulk) {
/* Case 2: in the middle of a multi bulk request. bulklen == -1 means
 * a "$<len>" header line is expected next. */
2248 if (c->bulklen == -1) {
2249 if (((char*)c->argv[0]->ptr)[0] != '$') {
2250 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2251 resetClient(c);
2252 return 1;
2253 } else {
2254 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2255 decrRefCount(c->argv[0]);
2256 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2257 c->argc--;
2258 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2259 resetClient(c);
2260 return 1;
2261 }
2262 c->argc--;
2263 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2264 return 1;
2265 }
2266 } else {
/* A bulk payload was read into argv[0]: move it to the mbargv vector */
2267 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2268 c->mbargv[c->mbargc] = c->argv[0];
2269 c->mbargc++;
2270 c->argc--;
2271 c->multibulk--;
2272 if (c->multibulk == 0) {
2273 robj **auxargv;
2274 int auxargc;
2275
2276 /* Here we need to swap the multi-bulk argc/argv with the
2277 * normal argc/argv of the client structure. */
2278 auxargv = c->argv;
2279 c->argv = c->mbargv;
2280 c->mbargv = auxargv;
2281
2282 auxargc = c->argc;
2283 c->argc = c->mbargc;
2284 c->mbargc = auxargc;
2285
2286 /* We need to set bulklen to something different than -1
2287 * in order for the code below to process the command without
2288 * trying to read the last argument of a bulk command as
2289 * a special argument. */
2290 c->bulklen = 0;
2291 /* continue below and process the command */
2292 } else {
/* More items expected: wait for the next "$<len>" header */
2293 c->bulklen = -1;
2294 return 1;
2295 }
2296 }
2297 }
2298 /* -- end of multi bulk commands processing -- */
2299
2300 /* The QUIT command is handled as a special case. Normal command
2301 * procs are unable to close the client connection safely */
2302 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2303 freeClient(c);
2304 return 0;
2305 }
2306
2307 /* Now lookup the command and check ASAP about trivial error conditions
2308 * such wrong arity, bad command name and so forth. */
2309 cmd = lookupCommand(c->argv[0]->ptr);
2310 if (!cmd) {
2311 addReplySds(c,
2312 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2313 (char*)c->argv[0]->ptr));
2314 resetClient(c);
2315 return 1;
/* A positive arity is an exact argument count; a negative arity -N means
 * at least N arguments. */
2316 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2317 (c->argc < -cmd->arity)) {
2318 addReplySds(c,
2319 sdscatprintf(sdsempty(),
2320 "-ERR wrong number of arguments for '%s' command\r\n",
2321 cmd->name));
2322 resetClient(c);
2323 return 1;
2324 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2325 /* This is a bulk command, we have to read the last argument yet. */
/* The last inline argument is the byte count of the payload */
2326 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2327
2328 decrRefCount(c->argv[c->argc-1]);
2329 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2330 c->argc--;
2331 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2332 resetClient(c);
2333 return 1;
2334 }
2335 c->argc--;
2336 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2337 /* It is possible that the bulk read is already in the
2338 * buffer. Check this condition and handle it accordingly.
2339 * This is just a fast path, alternative to call processInputBuffer().
2340 * It's a good idea since the code is small and this condition
2341 * happens most of the times. */
2342 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2343 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2344 c->argc++;
2345 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2346 } else {
2347 /* Otherwise return... there is to read the last argument
2348 * from the socket. */
2349 return 1;
2350 }
2351 }
2352 /* Let's try to encode the bulk object to save space. */
2353 if (cmd->flags & REDIS_CMD_BULK)
2354 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2355
2356 /* Check if the user is authenticated */
2357 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2358 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2359 resetClient(c);
2360 return 1;
2361 }
2362
2363 /* Handle the maxmemory directive */
2364 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2365 zmalloc_used_memory() > server.maxmemory)
2366 {
2367 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2368 resetClient(c);
2369 return 1;
2370 }
2371
2372 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2373 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2374 &&
2375 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2376 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2377 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2378 resetClient(c);
2379 return 1;
2380 }
2381
2382 /* Exec the command */
/* Inside MULTI every command except EXEC and DISCARD is queued */
2383 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2384 queueMultiCommand(c,cmd);
2385 addReply(c,shared.queued);
2386 } else {
/* When blockClientOnSwappedKeys() returns non-zero we return without
 * executing the command and without resetting the client. */
2387 if (server.vm_enabled && server.vm_max_threads > 0 &&
2388 blockClientOnSwappedKeys(cmd,c)) return 1;
2389 call(c,cmd);
2390 }
2391
2392 /* Prepare the client for the next command */
2393 resetClient(c);
2394 return 1;
2395 }
2396
/* Propagate a command to every slave in 'slaves' using the multi bulk
 * protocol: "*<argc>\r\n" followed, for each argument, by "$<len>\r\n",
 * the argument itself and "\r\n". If a slave's selected DB (slaveseldb)
 * differs from 'dictid', a SELECT command is sent first. Slaves still in
 * the REDIS_REPL_WAIT_BGSAVE_START state are skipped. */
2397 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2398 listNode *ln;
2399 listIter li;
2400 int outc = 0, j;
2401 robj **outv;
2402 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2403 * and we need 1 object for the first "*<count>\r\n" multibulk count, then
2404 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2405 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2406 robj *lenobj;
2407
/* Use the stack vector when it is large enough, the heap otherwise */
2408 if (argc <= REDIS_STATIC_ARGS) {
2409 outv = static_outv;
2410 } else {
2411 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2412 }
2413
2414 lenobj = createObject(REDIS_STRING,
2415 sdscatprintf(sdsempty(), "*%d\r\n", argc));
/* Freshly created objects get refcount 0: the incr/decr pair around the
 * slaves loop below then decides whether they are freed. */
2416 lenobj->refcount = 0;
2417 outv[outc++] = lenobj;
2418 for (j = 0; j < argc; j++) {
2419 lenobj = createObject(REDIS_STRING,
2420 sdscatprintf(sdsempty(),"$%lu\r\n",
2421 (unsigned long) stringObjectLen(argv[j])));
2422 lenobj->refcount = 0;
2423 outv[outc++] = lenobj;
2424 outv[outc++] = argv[j];
2425 outv[outc++] = shared.crlf;
2426 }
2427
2428 /* Increment all the refcounts at start and decrement at end in order to
2429 * be sure to free objects if there is no slave in a replication state
2430 * able to be fed with commands */
2431 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2432 listRewind(slaves,&li);
2433 while((ln = listNext(&li))) {
2434 redisClient *slave = ln->value;
2435
2436 /* Don't feed slaves that are still waiting for BGSAVE to start */
2437 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2438
2439 /* Feed all the other slaves, MONITORs and so on */
2440 if (slave->slaveseldb != dictid) {
2441 robj *selectcmd;
2442
/* DBs 0-9 use the preallocated shared SELECT objects; any other id
 * gets a temporary object built on the fly. */
2443 switch(dictid) {
2444 case 0: selectcmd = shared.select0; break;
2445 case 1: selectcmd = shared.select1; break;
2446 case 2: selectcmd = shared.select2; break;
2447 case 3: selectcmd = shared.select3; break;
2448 case 4: selectcmd = shared.select4; break;
2449 case 5: selectcmd = shared.select5; break;
2450 case 6: selectcmd = shared.select6; break;
2451 case 7: selectcmd = shared.select7; break;
2452 case 8: selectcmd = shared.select8; break;
2453 case 9: selectcmd = shared.select9; break;
2454 default:
2455 selectcmd = createObject(REDIS_STRING,
2456 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
/* NOTE(review): refcount 0 presumably leaves addReply() as the only
 * owner of this temporary object -- confirm in addReply(). */
2457 selectcmd->refcount = 0;
2458 break;
2459 }
2460 addReply(slave,selectcmd);
2461 slave->slaveseldb = dictid;
2462 }
2463 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2464 }
2465 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2466 if (outv != static_outv) zfree(outv);
2467 }
2468
2469 static sds sdscatrepr(sds s, char *p, size_t len) {
2470 s = sdscatlen(s,"\"",1);
2471 while(len--) {
2472 switch(*p) {
2473 case '\\':
2474 case '"':
2475 s = sdscatprintf(s,"\\%c",*p);
2476 break;
2477 case '\n': s = sdscatlen(s,"\\n",1); break;
2478 case '\r': s = sdscatlen(s,"\\r",1); break;
2479 case '\t': s = sdscatlen(s,"\\t",1); break;
2480 case '\a': s = sdscatlen(s,"\\a",1); break;
2481 case '\b': s = sdscatlen(s,"\\b",1); break;
2482 default:
2483 if (isprint(*p))
2484 s = sdscatprintf(s,"%c",*p);
2485 else
2486 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2487 break;
2488 }
2489 p++;
2490 }
2491 return sdscatlen(s,"\"",1);
2492 }
2493
2494 static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2495 listNode *ln;
2496 listIter li;
2497 int j;
2498 sds cmdrepr = sdsnew("+");
2499 robj *cmdobj;
2500 struct timeval tv;
2501
2502 gettimeofday(&tv,NULL);
2503 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2504 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2505
2506 for (j = 0; j < argc; j++) {
2507 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2508 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2509 } else {
2510 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2511 sdslen(argv[j]->ptr));
2512 }
2513 if (j != argc-1)
2514 cmdrepr = sdscatlen(cmdrepr," ",1);
2515 }
2516 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2517 cmdobj = createObject(REDIS_STRING,cmdrepr);
2518
2519 listRewind(monitors,&li);
2520 while((ln = listNext(&li))) {
2521 redisClient *monitor = ln->value;
2522 addReply(monitor,cmdobj);
2523 }
2524 decrRefCount(cmdobj);
2525 }
2526
2527 static void processInputBuffer(redisClient *c) {
2528 again:
2529 /* Before to process the input buffer, make sure the client is not
2530 * waitig for a blocking operation such as BLPOP. Note that the first
2531 * iteration the client is never blocked, otherwise the processInputBuffer
2532 * would not be called at all, but after the execution of the first commands
2533 * in the input buffer the client may be blocked, and the "goto again"
2534 * will try to reiterate. The following line will make it return asap. */
2535 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2536 if (c->bulklen == -1) {
2537 /* Read the first line of the query */
2538 char *p = strchr(c->querybuf,'\n');
2539 size_t querylen;
2540
2541 if (p) {
2542 sds query, *argv;
2543 int argc, j;
2544
2545 query = c->querybuf;
2546 c->querybuf = sdsempty();
2547 querylen = 1+(p-(query));
2548 if (sdslen(query) > querylen) {
2549 /* leave data after the first line of the query in the buffer */
2550 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2551 }
2552 *p = '\0'; /* remove "\n" */
2553 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2554 sdsupdatelen(query);
2555
2556 /* Now we can split the query in arguments */
2557 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2558 sdsfree(query);
2559
2560 if (c->argv) zfree(c->argv);
2561 c->argv = zmalloc(sizeof(robj*)*argc);
2562
2563 for (j = 0; j < argc; j++) {
2564 if (sdslen(argv[j])) {
2565 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2566 c->argc++;
2567 } else {
2568 sdsfree(argv[j]);
2569 }
2570 }
2571 zfree(argv);
2572 if (c->argc) {
2573 /* Execute the command. If the client is still valid
2574 * after processCommand() return and there is something
2575 * on the query buffer try to process the next command. */
2576 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2577 } else {
2578 /* Nothing to process, argc == 0. Just process the query
2579 * buffer if it's not empty or return to the caller */
2580 if (sdslen(c->querybuf)) goto again;
2581 }
2582 return;
2583 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2584 redisLog(REDIS_VERBOSE, "Client protocol error");
2585 freeClient(c);
2586 return;
2587 }
2588 } else {
2589 /* Bulk read handling. Note that if we are at this point
2590 the client already sent a command terminated with a newline,
2591 we are reading the bulk data that is actually the last
2592 argument of the command. */
2593 int qbl = sdslen(c->querybuf);
2594
2595 if (c->bulklen <= qbl) {
2596 /* Copy everything but the final CRLF as final argument */
2597 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2598 c->argc++;
2599 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2600 /* Process the command. If the client is still valid after
2601 * the processing and there is more data in the buffer
2602 * try to parse it. */
2603 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2604 return;
2605 }
2606 }
2607 }
2608
2609 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2610 redisClient *c = (redisClient*) privdata;
2611 char buf[REDIS_IOBUF_LEN];
2612 int nread;
2613 REDIS_NOTUSED(el);
2614 REDIS_NOTUSED(mask);
2615
2616 nread = read(fd, buf, REDIS_IOBUF_LEN);
2617 if (nread == -1) {
2618 if (errno == EAGAIN) {
2619 nread = 0;
2620 } else {
2621 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2622 freeClient(c);
2623 return;
2624 }
2625 } else if (nread == 0) {
2626 redisLog(REDIS_VERBOSE, "Client closed connection");
2627 freeClient(c);
2628 return;
2629 }
2630 if (nread) {
2631 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2632 c->lastinteraction = time(NULL);
2633 } else {
2634 return;
2635 }
2636 processInputBuffer(c);
2637 }
2638
2639 static int selectDb(redisClient *c, int id) {
2640 if (id < 0 || id >= server.dbnum)
2641 return REDIS_ERR;
2642 c->db = &server.db[id];
2643 return REDIS_OK;
2644 }
2645
2646 static void *dupClientReplyValue(void *o) {
2647 incrRefCount((robj*)o);
2648 return o;
2649 }
2650
/* List match method: returns non-zero when the two string objects compare
 * as equal according to compareStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2654
2655 static redisClient *createClient(int fd) {
2656 redisClient *c = zmalloc(sizeof(*c));
2657
2658 anetNonBlock(NULL,fd);
2659 anetTcpNoDelay(NULL,fd);
2660 if (!c) return NULL;
2661 selectDb(c,0);
2662 c->fd = fd;
2663 c->querybuf = sdsempty();
2664 c->argc = 0;
2665 c->argv = NULL;
2666 c->bulklen = -1;
2667 c->multibulk = 0;
2668 c->mbargc = 0;
2669 c->mbargv = NULL;
2670 c->sentlen = 0;
2671 c->flags = 0;
2672 c->lastinteraction = time(NULL);
2673 c->authenticated = 0;
2674 c->replstate = REDIS_REPL_NONE;
2675 c->reply = listCreate();
2676 listSetFreeMethod(c->reply,decrRefCount);
2677 listSetDupMethod(c->reply,dupClientReplyValue);
2678 c->blockingkeys = NULL;
2679 c->blockingkeysnum = 0;
2680 c->io_keys = listCreate();
2681 listSetFreeMethod(c->io_keys,decrRefCount);
2682 c->pubsub_channels = dictCreate(&setDictType,NULL);
2683 c->pubsub_patterns = listCreate();
2684 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2685 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2686 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2687 readQueryFromClient, c) == AE_ERR) {
2688 freeClient(c);
2689 return NULL;
2690 }
2691 listAddNodeTail(server.clients,c);
2692 initClientMultiState(c);
2693 return c;
2694 }
2695
2696 static void addReply(redisClient *c, robj *obj) {
2697 if (listLength(c->reply) == 0 &&
2698 (c->replstate == REDIS_REPL_NONE ||
2699 c->replstate == REDIS_REPL_ONLINE) &&
2700 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2701 sendReplyToClient, c) == AE_ERR) return;
2702
2703 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2704 obj = dupStringObject(obj);
2705 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2706 }
2707 listAddNodeTail(c->reply,getDecodedObject(obj));
2708 }
2709
2710 static void addReplySds(redisClient *c, sds s) {
2711 robj *o = createObject(REDIS_STRING,s);
2712 addReply(c,o);
2713 decrRefCount(o);
2714 }
2715
2716 static void addReplyDouble(redisClient *c, double d) {
2717 char buf[128];
2718
2719 snprintf(buf,sizeof(buf),"%.17g",d);
2720 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2721 (unsigned long) strlen(buf),buf));
2722 }
2723
2724 static void addReplyLong(redisClient *c, long l) {
2725 char buf[128];
2726 size_t len;
2727
2728 if (l == 0) {
2729 addReply(c,shared.czero);
2730 return;
2731 } else if (l == 1) {
2732 addReply(c,shared.cone);
2733 return;
2734 }
2735 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2736 addReplySds(c,sdsnewlen(buf,len));
2737 }
2738
2739 static void addReplyLongLong(redisClient *c, long long ll) {
2740 char buf[128];
2741 size_t len;
2742
2743 if (ll == 0) {
2744 addReply(c,shared.czero);
2745 return;
2746 } else if (ll == 1) {
2747 addReply(c,shared.cone);
2748 return;
2749 }
2750 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2751 addReplySds(c,sdsnewlen(buf,len));
2752 }
2753
2754 static void addReplyUlong(redisClient *c, unsigned long ul) {
2755 char buf[128];
2756 size_t len;
2757
2758 if (ul == 0) {
2759 addReply(c,shared.czero);
2760 return;
2761 } else if (ul == 1) {
2762 addReply(c,shared.cone);
2763 return;
2764 }
2765 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2766 addReplySds(c,sdsnewlen(buf,len));
2767 }
2768
2769 static void addReplyBulkLen(redisClient *c, robj *obj) {
2770 size_t len;
2771
2772 if (obj->encoding == REDIS_ENCODING_RAW) {
2773 len = sdslen(obj->ptr);
2774 } else {
2775 long n = (long)obj->ptr;
2776
2777 /* Compute how many bytes will take this integer as a radix 10 string */
2778 len = 1;
2779 if (n < 0) {
2780 len++;
2781 n = -n;
2782 }
2783 while((n = n/10) != 0) {
2784 len++;
2785 }
2786 }
2787 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2788 }
2789
2790 static void addReplyBulk(redisClient *c, robj *obj) {
2791 addReplyBulkLen(c,obj);
2792 addReply(c,obj);
2793 addReply(c,shared.crlf);
2794 }
2795
2796 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2797 static void addReplyBulkCString(redisClient *c, char *s) {
2798 if (s == NULL) {
2799 addReply(c,shared.nullbulk);
2800 } else {
2801 robj *o = createStringObject(s,strlen(s));
2802 addReplyBulk(c,o);
2803 decrRefCount(o);
2804 }
2805 }
2806
2807 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2808 int cport, cfd;
2809 char cip[128];
2810 redisClient *c;
2811 REDIS_NOTUSED(el);
2812 REDIS_NOTUSED(mask);
2813 REDIS_NOTUSED(privdata);
2814
2815 cfd = anetAccept(server.neterr, fd, cip, &cport);
2816 if (cfd == AE_ERR) {
2817 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2818 return;
2819 }
2820 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2821 if ((c = createClient(cfd)) == NULL) {
2822 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2823 close(cfd); /* May be already closed, just ingore errors */
2824 return;
2825 }
2826 /* If maxclient directive is set and this is one client more... close the
2827 * connection. Note that we create the client instead to check before
2828 * for this condition, since now the socket is already set in nonblocking
2829 * mode and we can send an error for free using the Kernel I/O */
2830 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2831 char *err = "-ERR max number of clients reached\r\n";
2832
2833 /* That's a best effort error message, don't check write errors */
2834 if (write(c->fd,err,strlen(err)) == -1) {
2835 /* Nothing to do, Just to avoid the warning... */
2836 }
2837 freeClient(c);
2838 return;
2839 }
2840 server.stat_numconnections++;
2841 }
2842
2843 /* ======================= Redis objects implementation ===================== */
2844
2845 static robj *createObject(int type, void *ptr) {
2846 robj *o;
2847
2848 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2849 if (listLength(server.objfreelist)) {
2850 listNode *head = listFirst(server.objfreelist);
2851 o = listNodeValue(head);
2852 listDelNode(server.objfreelist,head);
2853 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2854 } else {
2855 if (server.vm_enabled) {
2856 pthread_mutex_unlock(&server.obj_freelist_mutex);
2857 o = zmalloc(sizeof(*o));
2858 } else {
2859 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2860 }
2861 }
2862 o->type = type;
2863 o->encoding = REDIS_ENCODING_RAW;
2864 o->ptr = ptr;
2865 o->refcount = 1;
2866 if (server.vm_enabled) {
2867 /* Note that this code may run in the context of an I/O thread
2868 * and accessing to server.unixtime in theory is an error
2869 * (no locks). But in practice this is safe, and even if we read
2870 * garbage Redis will not fail, as it's just a statistical info */
2871 o->vm.atime = server.unixtime;
2872 o->storage = REDIS_VM_MEMORY;
2873 }
2874 return o;
2875 }
2876
2877 static robj *createStringObject(char *ptr, size_t len) {
2878 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2879 }
2880
2881 static robj *createStringObjectFromLongLong(long long value) {
2882 robj *o;
2883 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2884 incrRefCount(shared.integers[value]);
2885 o = shared.integers[value];
2886 } else {
2887 o = createObject(REDIS_STRING, NULL);
2888 if (value >= LONG_MIN && value <= LONG_MAX) {
2889 o->encoding = REDIS_ENCODING_INT;
2890 o->ptr = (void*)((long)value);
2891 } else {
2892 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2893 }
2894 }
2895 return o;
2896 }
2897
2898 static robj *dupStringObject(robj *o) {
2899 assert(o->encoding == REDIS_ENCODING_RAW);
2900 return createStringObject(o->ptr,sdslen(o->ptr));
2901 }
2902
2903 static robj *createListObject(void) {
2904 list *l = listCreate();
2905
2906 listSetFreeMethod(l,decrRefCount);
2907 return createObject(REDIS_LIST,l);
2908 }
2909
2910 static robj *createSetObject(void) {
2911 dict *d = dictCreate(&setDictType,NULL);
2912 return createObject(REDIS_SET,d);
2913 }
2914
2915 static robj *createHashObject(void) {
2916 /* All the Hashes start as zipmaps. Will be automatically converted
2917 * into hash tables if there are enough elements or big elements
2918 * inside. */
2919 unsigned char *zm = zipmapNew();
2920 robj *o = createObject(REDIS_HASH,zm);
2921 o->encoding = REDIS_ENCODING_ZIPMAP;
2922 return o;
2923 }
2924
2925 static robj *createZsetObject(void) {
2926 zset *zs = zmalloc(sizeof(*zs));
2927
2928 zs->dict = dictCreate(&zsetDictType,NULL);
2929 zs->zsl = zslCreate();
2930 return createObject(REDIS_ZSET,zs);
2931 }
2932
2933 static void freeStringObject(robj *o) {
2934 if (o->encoding == REDIS_ENCODING_RAW) {
2935 sdsfree(o->ptr);
2936 }
2937 }
2938
2939 static void freeListObject(robj *o) {
2940 listRelease((list*) o->ptr);
2941 }
2942
2943 static void freeSetObject(robj *o) {
2944 dictRelease((dict*) o->ptr);
2945 }
2946
2947 static void freeZsetObject(robj *o) {
2948 zset *zs = o->ptr;
2949
2950 dictRelease(zs->dict);
2951 zslFree(zs->zsl);
2952 zfree(zs);
2953 }
2954
2955 static void freeHashObject(robj *o) {
2956 switch (o->encoding) {
2957 case REDIS_ENCODING_HT:
2958 dictRelease((dict*) o->ptr);
2959 break;
2960 case REDIS_ENCODING_ZIPMAP:
2961 zfree(o->ptr);
2962 break;
2963 default:
2964 redisPanic("Unknown hash encoding type");
2965 break;
2966 }
2967 }
2968
2969 static void incrRefCount(robj *o) {
2970 o->refcount++;
2971 }
2972
2973 static void decrRefCount(void *obj) {
2974 robj *o = obj;
2975
2976 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2977 /* Object is a key of a swapped out value, or in the process of being
2978 * loaded. */
2979 if (server.vm_enabled &&
2980 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2981 {
2982 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2983 redisAssert(o->type == REDIS_STRING);
2984 freeStringObject(o);
2985 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2986 pthread_mutex_lock(&server.obj_freelist_mutex);
2987 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2988 !listAddNodeHead(server.objfreelist,o))
2989 zfree(o);
2990 pthread_mutex_unlock(&server.obj_freelist_mutex);
2991 server.vm_stats_swapped_objects--;
2992 return;
2993 }
2994 /* Object is in memory, or in the process of being swapped out. */
2995 if (--(o->refcount) == 0) {
2996 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2997 vmCancelThreadedIOJob(obj);
2998 switch(o->type) {
2999 case REDIS_STRING: freeStringObject(o); break;
3000 case REDIS_LIST: freeListObject(o); break;
3001 case REDIS_SET: freeSetObject(o); break;
3002 case REDIS_ZSET: freeZsetObject(o); break;
3003 case REDIS_HASH: freeHashObject(o); break;
3004 default: redisPanic("Unknown object type"); break;
3005 }
3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
3007 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3008 !listAddNodeHead(server.objfreelist,o))
3009 zfree(o);
3010 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
3011 }
3012 }
3013
3014 static robj *lookupKey(redisDb *db, robj *key) {
3015 dictEntry *de = dictFind(db->dict,key);
3016 if (de) {
3017 robj *key = dictGetEntryKey(de);
3018 robj *val = dictGetEntryVal(de);
3019
3020 if (server.vm_enabled) {
3021 if (key->storage == REDIS_VM_MEMORY ||
3022 key->storage == REDIS_VM_SWAPPING)
3023 {
3024 /* If we were swapping the object out, stop it, this key
3025 * was requested. */
3026 if (key->storage == REDIS_VM_SWAPPING)
3027 vmCancelThreadedIOJob(key);
3028 /* Update the access time of the key for the aging algorithm. */
3029 key->vm.atime = server.unixtime;
3030 } else {
3031 int notify = (key->storage == REDIS_VM_LOADING);
3032
3033 /* Our value was swapped on disk. Bring it at home. */
3034 redisAssert(val == NULL);
3035 val = vmLoadObject(key);
3036 dictGetEntryVal(de) = val;
3037
3038 /* Clients blocked by the VM subsystem may be waiting for
3039 * this key... */
3040 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3041 }
3042 }
3043 return val;
3044 } else {
3045 return NULL;
3046 }
3047 }
3048
3049 static robj *lookupKeyRead(redisDb *db, robj *key) {
3050 expireIfNeeded(db,key);
3051 return lookupKey(db,key);
3052 }
3053
3054 static robj *lookupKeyWrite(redisDb *db, robj *key) {
3055 deleteIfVolatile(db,key);
3056 return lookupKey(db,key);
3057 }
3058
3059 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3060 robj *o = lookupKeyRead(c->db, key);
3061 if (!o) addReply(c,reply);
3062 return o;
3063 }
3064
3065 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3066 robj *o = lookupKeyWrite(c->db, key);
3067 if (!o) addReply(c,reply);
3068 return o;
3069 }
3070
3071 static int checkType(redisClient *c, robj *o, int type) {
3072 if (o->type != type) {
3073 addReply(c,shared.wrongtypeerr);
3074 return 1;
3075 }
3076 return 0;
3077 }
3078
3079 static int deleteKey(redisDb *db, robj *key) {
3080 int retval;
3081
3082 /* We need to protect key from destruction: after the first dictDelete()
3083 * it may happen that 'key' is no longer valid if we don't increment
3084 * it's count. This may happen when we get the object reference directly
3085 * from the hash table with dictRandomKey() or dict iterators */
3086 incrRefCount(key);
3087 if (dictSize(db->expires)) dictDelete(db->expires,key);
3088 retval = dictDelete(db->dict,key);
3089 decrRefCount(key);
3090
3091 return retval == DICT_OK;
3092 }
3093
3094 /* Check if the nul-terminated string 's' can be represented by a long
3095 * (that is, is a number that fits into long without any other space or
3096 * character before or after the digits).
3097 *
3098 * If so, the function returns REDIS_OK and *longval is set to the value
3099 * of the number. Otherwise REDIS_ERR is returned */
3100 static int isStringRepresentableAsLong(sds s, long *longval) {
3101 char buf[32], *endptr;
3102 long value;
3103 int slen;
3104
3105 value = strtol(s, &endptr, 10);
3106 if (endptr[0] != '\0') return REDIS_ERR;
3107 slen = snprintf(buf,32,"%ld",value);
3108
3109 /* If the number converted back into a string is not identical
3110 * then it's not possible to encode the string as integer */
3111 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3112 if (longval) *longval = value;
3113 return REDIS_OK;
3114 }
3115
3116 /* Try to encode a string object in order to save space */
3117 static robj *tryObjectEncoding(robj *o) {
3118 long value;
3119 sds s = o->ptr;
3120
3121 if (o->encoding != REDIS_ENCODING_RAW)
3122 return o; /* Already encoded */
3123
3124 /* It's not safe to encode shared objects: shared objects can be shared
3125 * everywhere in the "object space" of Redis. Encoded objects can only
3126 * appear as "values" (and not, for instance, as keys) */
3127 if (o->refcount > 1) return o;
3128
3129 /* Currently we try to encode only strings */
3130 redisAssert(o->type == REDIS_STRING);
3131
3132 /* Check if we can represent this string as a long integer */
3133 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3134
3135 /* Ok, this object can be encoded */
3136 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3137 decrRefCount(o);
3138 incrRefCount(shared.integers[value]);
3139 return shared.integers[value];
3140 } else {
3141 o->encoding = REDIS_ENCODING_INT;
3142 sdsfree(o->ptr);
3143 o->ptr = (void*) value;
3144 return o;
3145 }
3146 }
3147
3148 /* Get a decoded version of an encoded object (returned as a new object).
3149 * If the object is already raw-encoded just increment the ref count. */
3150 static robj *getDecodedObject(robj *o) {
3151 robj *dec;
3152
3153 if (o->encoding == REDIS_ENCODING_RAW) {
3154 incrRefCount(o);
3155 return o;
3156 }
3157 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3158 char buf[32];
3159
3160 snprintf(buf,32,"%ld",(long)o->ptr);
3161 dec = createStringObject(buf,strlen(buf));
3162 return dec;
3163 } else {
3164 redisPanic("Unknown encoding type");
3165 }
3166 }
3167
3168 /* Compare two string objects via strcmp() or alike.
3169 * Note that the objects may be integer-encoded. In such a case we
3170 * use snprintf() to get a string representation of the numbers on the stack
3171 * and compare the strings, it's much faster than calling getDecodedObject().
3172 *
3173 * Important note: if objects are not integer encoded, but binary-safe strings,
3174 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3175 * binary safe. */
3176 static int compareStringObjects(robj *a, robj *b) {
3177 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3178 char bufa[128], bufb[128], *astr, *bstr;
3179 int bothsds = 1;
3180
3181 if (a == b) return 0;
3182 if (a->encoding != REDIS_ENCODING_RAW) {
3183 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3184 astr = bufa;
3185 bothsds = 0;
3186 } else {
3187 astr = a->ptr;
3188 }
3189 if (b->encoding != REDIS_ENCODING_RAW) {
3190 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3191 bstr = bufb;
3192 bothsds = 0;
3193 } else {
3194 bstr = b->ptr;
3195 }
3196 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3197 }
3198
3199 static size_t stringObjectLen(robj *o) {
3200 redisAssert(o->type == REDIS_STRING);
3201 if (o->encoding == REDIS_ENCODING_RAW) {
3202 return sdslen(o->ptr);
3203 } else {
3204 char buf[32];
3205
3206 return snprintf(buf,32,"%ld",(long)o->ptr);
3207 }
3208 }
3209
3210 static int getDoubleFromObject(robj *o, double *target) {
3211 double value;
3212 char *eptr;
3213
3214 if (o == NULL) {
3215 value = 0;
3216 } else {
3217 redisAssert(o->type == REDIS_STRING);
3218 if (o->encoding == REDIS_ENCODING_RAW) {
3219 value = strtod(o->ptr, &eptr);
3220 if (eptr[0] != '\0') return REDIS_ERR;
3221 } else if (o->encoding == REDIS_ENCODING_INT) {
3222 value = (long)o->ptr;
3223 } else {
3224 redisPanic("Unknown string encoding");
3225 }
3226 }
3227
3228 *target = value;
3229 return REDIS_OK;
3230 }
3231
3232 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3233 double value;
3234 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3235 if (msg != NULL) {
3236 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3237 } else {
3238 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3239 }
3240 return REDIS_ERR;
3241 }
3242
3243 *target = value;
3244 return REDIS_OK;
3245 }
3246
3247 static int getLongLongFromObject(robj *o, long long *target) {
3248 long long value;
3249 char *eptr;
3250
3251 if (o == NULL) {
3252 value = 0;
3253 } else {
3254 redisAssert(o->type == REDIS_STRING);
3255 if (o->encoding == REDIS_ENCODING_RAW) {
3256 value = strtoll(o->ptr, &eptr, 10);
3257 if (eptr[0] != '\0') return REDIS_ERR;
3258 } else if (o->encoding == REDIS_ENCODING_INT) {
3259 value = (long)o->ptr;
3260 } else {
3261 redisPanic("Unknown string encoding");
3262 }
3263 }
3264
3265 *target = value;
3266 return REDIS_OK;
3267 }
3268
3269 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3270 long long value;
3271 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3272 if (msg != NULL) {
3273 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3274 } else {
3275 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3276 }
3277 return REDIS_ERR;
3278 }
3279
3280 *target = value;
3281 return REDIS_OK;
3282 }
3283
3284 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3285 long long value;
3286
3287 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3288 if (value < LONG_MIN || value > LONG_MAX) {
3289 if (msg != NULL) {
3290 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3291 } else {
3292 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3293 }
3294 return REDIS_ERR;
3295 }
3296
3297 *target = value;
3298 return REDIS_OK;
3299 }
3300
3301 /*============================ RDB saving/loading =========================== */
3302
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte was not written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t nitems = fwrite(&type,1,1,fp);

    return (nitems == 0) ? -1 : 0;
}
3307
/* Write the time 't' to 'fp' as a 32 bit signed integer in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;
    size_t nitems = fwrite(&t32,sizeof(t32),1,fp);

    return (nitems == 0) ? -1 : 0;
}
3313
3314 /* check rdbLoadLen() comments for more info */
3315 static int rdbSaveLen(FILE *fp, uint32_t len) {
3316 unsigned char buf[2];
3317
3318 if (len < (1<<6)) {
3319 /* Save a 6 bit len */
3320 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3321 if (fwrite(buf,1,1,fp) == 0) return -1;
3322 } else if (len < (1<<14)) {
3323 /* Save a 14 bit len */
3324 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3325 buf[1] = len&0xFF;
3326 if (fwrite(buf,2,1,fp) == 0) return -1;
3327 } else {
3328 /* Save a 32 bit len */
3329 buf[0] = (REDIS_RDB_32BITLEN<<6);
3330 if (fwrite(buf,1,1,fp) == 0) return -1;
3331 len = htonl(len);
3332 if (fwrite(&len,4,1,fp) == 0) return -1;
3333 }
3334 return 0;
3335 }
3336
3337 /* String objects in the form "2391" "-100" without any space and with a
3338 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3339 * encoded as integers to save space */
3340 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3341 long long value;
3342 char *endptr, buf[32];
3343
3344 /* Check if it's possible to encode this value as a number */
3345 value = strtoll(s, &endptr, 10);
3346 if (endptr[0] != '\0') return 0;
3347 snprintf(buf,32,"%lld",value);
3348
3349 /* If the number converted back into a string is not identical
3350 * then it's not possible to encode the string as integer */
3351 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3352
3353 /* Finally check if it fits in our ranges */
3354 if (value >= -(1<<7) && value <= (1<<7)-1) {
3355 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3356 enc[1] = value&0xFF;
3357 return 2;
3358 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3359 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3360 enc[1] = value&0xFF;
3361 enc[2] = (value>>8)&0xFF;
3362 return 3;
3363 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3364 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3365 enc[1] = value&0xFF;
3366 enc[2] = (value>>8)&0xFF;
3367 enc[3] = (value>>16)&0xFF;
3368 enc[4] = (value>>24)&0xFF;
3369 return 5;
3370 } else {
3371 return 0;
3372 }
3373 }
3374
3375 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3376 size_t comprlen, outlen;
3377 unsigned char byte;
3378 void *out;
3379
3380 /* We require at least four bytes compression for this to be worth it */
3381 if (len <= 4) return 0;
3382 outlen = len-4;
3383 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3384 comprlen = lzf_compress(s, len, out, outlen);
3385 if (comprlen == 0) {
3386 zfree(out);
3387 return 0;
3388 }
3389 /* Data compressed! Let's save it on disk */
3390 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3391 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3392 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3393 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3394 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3395 zfree(out);
3396 return comprlen;
3397
3398 writeerr:
3399 zfree(out);
3400 return -1;
3401 }
3402
3403 /* Save a string objet as [len][data] on disk. If the object is a string
3404 * representation of an integer value we try to safe it in a special form */
3405 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3406 int enclen;
3407
3408 /* Try integer encoding */
3409 if (len <= 11) {
3410 unsigned char buf[5];
3411 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3412 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3413 return 0;
3414 }
3415 }
3416
3417 /* Try LZF compression - under 20 bytes it's unable to compress even
3418 * aaaaaaaaaaaaaaaaaa so skip it */
3419 if (server.rdbcompression && len > 20) {
3420 int retval;
3421
3422 retval = rdbSaveLzfStringObject(fp,s,len);
3423 if (retval == -1) return -1;
3424 if (retval > 0) return 0;
3425 /* retval == 0 means data can't be compressed, save the old way */
3426 }
3427
3428 /* Store verbatim */
3429 if (rdbSaveLen(fp,len) == -1) return -1;
3430 if (len && fwrite(s,len,1,fp) == 0) return -1;
3431 return 0;
3432 }
3433
3434 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3435 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3436 int retval;
3437
3438 /* Avoid incr/decr ref count business when possible.
3439 * This plays well with copy-on-write given that we are probably
3440 * in a child process (BGSAVE). Also this makes sure key objects
3441 * of swapped objects are not incRefCount-ed (an assert does not allow
3442 * this in order to avoid bugs) */
3443 if (obj->encoding != REDIS_ENCODING_RAW) {
3444 obj = getDecodedObject(obj);
3445 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3446 decrRefCount(obj);
3447 } else {
3448 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3449 }
3450 return retval;
3451 }
3452
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Finite values are formatted with "%.17g". Returns 0 on success, -1 on
 * write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        /* The representation starts after the length byte. */
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3479
3480 /* Save a Redis object. */
3481 static int rdbSaveObject(FILE *fp, robj *o) {
3482 if (o->type == REDIS_STRING) {
3483 /* Save a string value */
3484 if (rdbSaveStringObject(fp,o) == -1) return -1;
3485 } else if (o->type == REDIS_LIST) {
3486 /* Save a list value */
3487 list *list = o->ptr;
3488 listIter li;
3489 listNode *ln;
3490
3491 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3492 listRewind(list,&li);
3493 while((ln = listNext(&li))) {
3494 robj *eleobj = listNodeValue(ln);
3495
3496 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3497 }
3498 } else if (o->type == REDIS_SET) {
3499 /* Save a set value */
3500 dict *set = o->ptr;
3501 dictIterator *di = dictGetIterator(set);
3502 dictEntry *de;
3503
3504 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3505 while((de = dictNext(di)) != NULL) {
3506 robj *eleobj = dictGetEntryKey(de);
3507
3508 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3509 }
3510 dictReleaseIterator(di);
3511 } else if (o->type == REDIS_ZSET) {
3512 /* Save a set value */
3513 zset *zs = o->ptr;
3514 dictIterator *di = dictGetIterator(zs->dict);
3515 dictEntry *de;
3516
3517 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3518 while((de = dictNext(di)) != NULL) {
3519 robj *eleobj = dictGetEntryKey(de);
3520 double *score = dictGetEntryVal(de);
3521
3522 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3523 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3524 }
3525 dictReleaseIterator(di);
3526 } else if (o->type == REDIS_HASH) {
3527 /* Save a hash value */
3528 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3529 unsigned char *p = zipmapRewind(o->ptr);
3530 unsigned int count = zipmapLen(o->ptr);
3531 unsigned char *key, *val;
3532 unsigned int klen, vlen;
3533
3534 if (rdbSaveLen(fp,count) == -1) return -1;
3535 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3536 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3537 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3538 }
3539 } else {
3540 dictIterator *di = dictGetIterator(o->ptr);
3541 dictEntry *de;
3542
3543 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3544 while((de = dictNext(di)) != NULL) {
3545 robj *key = dictGetEntryKey(de);
3546 robj *val = dictGetEntryVal(de);
3547
3548 if (rdbSaveStringObject(fp,key) == -1) return -1;
3549 if (rdbSaveStringObject(fp,val) == -1) return -1;
3550 }
3551 dictReleaseIterator(di);
3552 }
3553 } else {
3554 redisPanic("Unknown object type");
3555 }
3556 return 0;
3557 }
3558
3559 /* Return the length the object will have on disk if saved with
3560 * the rdbSaveObject() function. Currently we use a trick to get
3561 * this length with very little changes to the code. In the future
3562 * we could switch to a faster solution. */
3563 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3564 if (fp == NULL) fp = server.devnull;
3565 rewind(fp);
3566 assert(rdbSaveObject(fp,o) != 1);
3567 return ftello(fp);
3568 }
3569
3570 /* Return the number of pages required to save this object in the swap file */
3571 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3572 off_t bytes = rdbSavedObjectLen(o,fp);
3573
3574 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3575 }
3576
3577 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3578 static int rdbSave(char *filename) {
3579 dictIterator *di = NULL;
3580 dictEntry *de;
3581 FILE *fp;
3582 char tmpfile[256];
3583 int j;
3584 time_t now = time(NULL);
3585
3586 /* Wait for I/O therads to terminate, just in case this is a
3587 * foreground-saving, to avoid seeking the swap file descriptor at the
3588 * same time. */
3589 if (server.vm_enabled)
3590 waitEmptyIOJobsQueue();
3591
3592 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3593 fp = fopen(tmpfile,"w");
3594 if (!fp) {
3595 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3596 return REDIS_ERR;
3597 }
3598 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3599 for (j = 0; j < server.dbnum; j++) {
3600 redisDb *db = server.db+j;
3601 dict *d = db->dict;
3602 if (dictSize(d) == 0) continue;
3603 di = dictGetIterator(d);
3604 if (!di) {
3605 fclose(fp);
3606 return REDIS_ERR;
3607 }
3608
3609 /* Write the SELECT DB opcode */
3610 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3611 if (rdbSaveLen(fp,j) == -1) goto werr;
3612
3613 /* Iterate this DB writing every entry */
3614 while((de = dictNext(di)) != NULL) {
3615 robj *key = dictGetEntryKey(de);
3616 robj *o = dictGetEntryVal(de);
3617 time_t expiretime = getExpire(db,key);
3618
3619 /* Save the expire time */
3620 if (expiretime != -1) {
3621 /* If this key is already expired skip it */
3622 if (expiretime < now) continue;
3623 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3624 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3625 }
3626 /* Save the key and associated value. This requires special
3627 * handling if the value is swapped out. */
3628 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3629 key->storage == REDIS_VM_SWAPPING) {
3630 /* Save type, key, value */
3631 if (rdbSaveType(fp,o->type) == -1) goto werr;
3632 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3633 if (rdbSaveObject(fp,o) == -1) goto werr;
3634 } else {
3635 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3636 robj *po;
3637 /* Get a preview of the object in memory */
3638 po = vmPreviewObject(key);
3639 /* Save type, key, value */
3640 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3641 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3642 if (rdbSaveObject(fp,po) == -1) goto werr;
3643 /* Remove the loaded object from memory */
3644 decrRefCount(po);
3645 }
3646 }
3647 dictReleaseIterator(di);
3648 }
3649 /* EOF opcode */
3650 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3651
3652 /* Make sure data will not remain on the OS's output buffers */
3653 fflush(fp);
3654 fsync(fileno(fp));
3655 fclose(fp);
3656
3657 /* Use RENAME to make sure the DB file is changed atomically only
3658 * if the generate DB file is ok. */
3659 if (rename(tmpfile,filename) == -1) {
3660 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3661 unlink(tmpfile);
3662 return REDIS_ERR;
3663 }
3664 redisLog(REDIS_NOTICE,"DB saved on disk");
3665 server.dirty = 0;
3666 server.lastsave = time(NULL);
3667 return REDIS_OK;
3668
3669 werr:
3670 fclose(fp);
3671 unlink(tmpfile);
3672 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3673 if (di) dictReleaseIterator(di);
3674 return REDIS_ERR;
3675 }
3676
3677 static int rdbSaveBackground(char *filename) {
3678 pid_t childpid;
3679
3680 if (server.bgsavechildpid != -1) return REDIS_ERR;
3681 if (server.vm_enabled) waitEmptyIOJobsQueue();
3682 if ((childpid = fork()) == 0) {
3683 /* Child */
3684 if (server.vm_enabled) vmReopenSwapFile();
3685 close(server.fd);
3686 if (rdbSave(filename) == REDIS_OK) {
3687 _exit(0);
3688 } else {
3689 _exit(1);
3690 }
3691 } else {
3692 /* Parent */
3693 if (childpid == -1) {
3694 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3695 strerror(errno));
3696 return REDIS_ERR;
3697 }
3698 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3699 server.bgsavechildpid = childpid;
3700 updateDictResizePolicy();
3701 return REDIS_OK;
3702 }
3703 return REDIS_OK; /* unreached */
3704 }
3705
/* Remove the temporary dump file "temp-<childpid>.rdb", that is the name
 * rdbSave() uses when it runs inside the process with that pid. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3712
/* Read a single byte type/opcode from 'fp'. The byte value (0-255) is
 * returned, or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 1) ? byte : -1;
}
3718
/* Read a time stored as a raw 32 bit signed integer (in the byte order of
 * the host) from 'fp'. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3724
3725 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3726 * of this file for a description of how this are stored on disk.
3727 *
3728 * isencoded is set to 1 if the readed length is not actually a length but
3729 * an "encoding type", check the above comments for more info */
3730 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3731 unsigned char buf[2];
3732 uint32_t len;
3733 int type;
3734
3735 if (isencoded) *isencoded = 0;
3736 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3737 type = (buf[0]&0xC0)>>6;
3738 if (type == REDIS_RDB_6BITLEN) {
3739 /* Read a 6 bit len */
3740 return buf[0]&0x3F;
3741 } else if (type == REDIS_RDB_ENCVAL) {
3742 /* Read a 6 bit len encoding type */
3743 if (isencoded) *isencoded = 1;
3744 return buf[0]&0x3F;
3745 } else if (type == REDIS_RDB_14BITLEN) {
3746 /* Read a 14 bit len */
3747 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3748 return ((buf[0]&0x3F)<<8)|buf[1];
3749 } else {
3750 /* Read a 32 bit len */
3751 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3752 return ntohl(len);
3753 }
3754 }
3755
3756 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3757 unsigned char enc[4];
3758 long long val;
3759
3760 if (enctype == REDIS_RDB_ENC_INT8) {
3761 if (fread(enc,1,1,fp) == 0) return NULL;
3762 val = (signed char)enc[0];
3763 } else if (enctype == REDIS_RDB_ENC_INT16) {
3764 uint16_t v;
3765 if (fread(enc,2,1,fp) == 0) return NULL;
3766 v = enc[0]|(enc[1]<<8);
3767 val = (int16_t)v;
3768 } else if (enctype == REDIS_RDB_ENC_INT32) {
3769 uint32_t v;
3770 if (fread(enc,4,1,fp) == 0) return NULL;
3771 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3772 val = (int32_t)v;
3773 } else {
3774 val = 0; /* anti-warning */
3775 redisPanic("Unknown RDB integer encoding type");
3776 }
3777 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3778 }
3779
3780 static robj *rdbLoadLzfStringObject(FILE*fp) {
3781 unsigned int len, clen;
3782 unsigned char *c = NULL;
3783 sds val = NULL;
3784
3785 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3786 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3787 if ((c = zmalloc(clen)) == NULL) goto err;
3788 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3789 if (fread(c,clen,1,fp) == 0) goto err;
3790 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3791 zfree(c);
3792 return createObject(REDIS_STRING,val);
3793 err:
3794 zfree(c);
3795 sdsfree(val);
3796 return NULL;
3797 }
3798
3799 static robj *rdbLoadStringObject(FILE*fp) {
3800 int isencoded;
3801 uint32_t len;
3802 sds val;
3803
3804 len = rdbLoadLen(fp,&isencoded);
3805 if (isencoded) {
3806 switch(len) {
3807 case REDIS_RDB_ENC_INT8:
3808 case REDIS_RDB_ENC_INT16:
3809 case REDIS_RDB_ENC_INT32:
3810 return rdbLoadIntegerObject(fp,len);
3811 case REDIS_RDB_ENC_LZF:
3812 return rdbLoadLzfStringObject(fp);
3813 default:
3814 redisPanic("Unknown RDB encoding type");
3815 }
3816 }
3817
3818 if (len == REDIS_RDB_LENERR) return NULL;
3819 val = sdsnewlen(NULL,len);
3820 if (len && fread(val,len,1,fp) == 0) {
3821 sdsfree(val);
3822 return NULL;
3823 }
3824 return createObject(REDIS_STRING,val);
3825 }
3826
3827 /* For information about double serialization check rdbSaveDoubleValue() */
3828 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3829 char buf[128];
3830 unsigned char len;
3831
3832 if (fread(&len,1,1,fp) == 0) return -1;
3833 switch(len) {
3834 case 255: *val = R_NegInf; return 0;
3835 case 254: *val = R_PosInf; return 0;
3836 case 253: *val = R_Nan; return 0;
3837 default:
3838 if (fread(buf,len,1,fp) == 0) return -1;
3839 buf[len] = '\0';
3840 sscanf(buf, "%lg", val);
3841 return 0;
3842 }
3843 }
3844
3845 /* Load a Redis object of the specified type from the specified file.
3846 * On success a newly allocated object is returned, otherwise NULL. */
3847 static robj *rdbLoadObject(int type, FILE *fp) {
3848 robj *o;
3849
3850 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3851 if (type == REDIS_STRING) {
3852 /* Read string value */
3853 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3854 o = tryObjectEncoding(o);
3855 } else if (type == REDIS_LIST || type == REDIS_SET) {
3856 /* Read list/set value */
3857 uint32_t listlen;
3858
3859 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3860 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3861 /* It's faster to expand the dict to the right size asap in order
3862 * to avoid rehashing */
3863 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3864 dictExpand(o->ptr,listlen);
3865 /* Load every single element of the list/set */
3866 while(listlen--) {
3867 robj *ele;
3868
3869 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3870 ele = tryObjectEncoding(ele);
3871 if (type == REDIS_LIST) {
3872 listAddNodeTail((list*)o->ptr,ele);
3873 } else {
3874 dictAdd((dict*)o->ptr,ele,NULL);
3875 }
3876 }
3877 } else if (type == REDIS_ZSET) {
3878 /* Read list/set value */
3879 size_t zsetlen;
3880 zset *zs;
3881
3882 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3883 o = createZsetObject();
3884 zs = o->ptr;
3885 /* Load every single element of the list/set */
3886 while(zsetlen--) {
3887 robj *ele;
3888 double *score = zmalloc(sizeof(double));
3889
3890 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3891 ele = tryObjectEncoding(ele);
3892 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3893 dictAdd(zs->dict,ele,score);
3894 zslInsert(zs->zsl,*score,ele);
3895 incrRefCount(ele); /* added to skiplist */
3896 }
3897 } else if (type == REDIS_HASH) {
3898 size_t hashlen;
3899
3900 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3901 o = createHashObject();
3902 /* Too many entries? Use an hash table. */
3903 if (hashlen > server.hash_max_zipmap_entries)
3904 convertToRealHash(o);
3905 /* Load every key/value, then set it into the zipmap or hash
3906 * table, as needed. */
3907 while(hashlen--) {
3908 robj *key, *val;
3909
3910 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3911 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3912 /* If we are using a zipmap and there are too big values
3913 * the object is converted to real hash table encoding. */
3914 if (o->encoding != REDIS_ENCODING_HT &&
3915 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3916 sdslen(val->ptr) > server.hash_max_zipmap_value))
3917 {
3918 convertToRealHash(o);
3919 }
3920
3921 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3922 unsigned char *zm = o->ptr;
3923
3924 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3925 val->ptr,sdslen(val->ptr),NULL);
3926 o->ptr = zm;
3927 decrRefCount(key);
3928 decrRefCount(val);
3929 } else {
3930 key = tryObjectEncoding(key);
3931 val = tryObjectEncoding(val);
3932 dictAdd((dict*)o->ptr,key,val);
3933 }
3934 }
3935 } else {
3936 redisPanic("Unknown object type");
3937 }
3938 return o;
3939 }
3940
3941 static int rdbLoad(char *filename) {
3942 FILE *fp;
3943 robj *keyobj = NULL;
3944 uint32_t dbid;
3945 int type, retval, rdbver;
3946 dict *d = server.db[0].dict;
3947 redisDb *db = server.db+0;
3948 char buf[1024];
3949 time_t expiretime = -1, now = time(NULL);
3950 long long loadedkeys = 0;
3951
3952 fp = fopen(filename,"r");
3953 if (!fp) return REDIS_ERR;
3954 if (fread(buf,9,1,fp) == 0) goto eoferr;
3955 buf[9] = '\0';
3956 if (memcmp(buf,"REDIS",5) != 0) {
3957 fclose(fp);
3958 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3959 return REDIS_ERR;
3960 }
3961 rdbver = atoi(buf+5);
3962 if (rdbver != 1) {
3963 fclose(fp);
3964 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3965 return REDIS_ERR;
3966 }
3967 while(1) {
3968 robj *o;
3969
3970 /* Read type. */
3971 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3972 if (type == REDIS_EXPIRETIME) {
3973 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3974 /* We read the time so we need to read the object type again */
3975 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3976 }
3977 if (type == REDIS_EOF) break;
3978 /* Handle SELECT DB opcode as a special case */
3979 if (type == REDIS_SELECTDB) {
3980 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3981 goto eoferr;
3982 if (dbid >= (unsigned)server.dbnum) {
3983 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3984 exit(1);
3985 }
3986 db = server.db+dbid;
3987 d = db->dict;
3988 continue;
3989 }
3990 /* Read key */
3991 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3992 /* Read value */
3993 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3994 /* Add the new object in the hash table */
3995 retval = dictAdd(d,keyobj,o);
3996 if (retval == DICT_ERR) {
3997 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3998 exit(1);
3999 }
4000 /* Set the expire time if needed */
4001 if (expiretime != -1) {
4002 setExpire(db,keyobj,expiretime);
4003 /* Delete this key if already expired */
4004 if (expiretime < now) deleteKey(db,keyobj);
4005 expiretime = -1;
4006 }
4007 keyobj = o = NULL;
4008 /* Handle swapping while loading big datasets when VM is on */
4009 loadedkeys++;
4010 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
4011 while (zmalloc_used_memory() > server.vm_max_memory) {
4012 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
4013 }
4014 }
4015 }
4016 fclose(fp);
4017 return REDIS_OK;
4018
4019 eoferr: /* unexpected end of file is handled here with a fatal exit */
4020 if (keyobj) decrRefCount(keyobj);
4021 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
4022 exit(1);
4023 return REDIS_ERR; /* Just to avoid warning */
4024 }
4025
4026 /*================================== Commands =============================== */
4027
4028 static void authCommand(redisClient *c) {
4029 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
4030 c->authenticated = 1;
4031 addReply(c,shared.ok);
4032 } else {
4033 c->authenticated = 0;
4034 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
4035 }
4036 }
4037
4038 static void pingCommand(redisClient *c) {
4039 addReply(c,shared.pong);
4040 }
4041
4042 static void echoCommand(redisClient *c) {
4043 addReplyBulk(c,c->argv[1]);
4044 }
4045
4046 /*=================================== Strings =============================== */
4047
4048 static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
4049 int retval;
4050 long seconds = 0; /* initialized to avoid an harmness warning */
4051
4052 if (expire) {
4053 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4054 return;
4055 if (seconds <= 0) {
4056 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4057 return;
4058 }
4059 }
4060
4061 if (nx) deleteIfVolatile(c->db,key);
4062 retval = dictAdd(c->db->dict,key,val);
4063 if (retval == DICT_ERR) {
4064 if (!nx) {
4065 /* If the key is about a swapped value, we want a new key object
4066 * to overwrite the old. So we delete the old key in the database.
4067 * This will also make sure that swap pages about the old object
4068 * will be marked as free. */
4069 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4070 incrRefCount(key);
4071 dictReplace(c->db->dict,key,val);
4072 incrRefCount(val);
4073 } else {
4074 addReply(c,shared.czero);
4075 return;
4076 }
4077 } else {
4078 incrRefCount(key);
4079 incrRefCount(val);
4080 }
4081 server.dirty++;
4082 removeExpire(c->db,key);
4083 if (expire) setExpire(c->db,key,time(NULL)+seconds);
4084 addReply(c, nx ? shared.cone : shared.ok);
4085 }
4086
4087 static void setCommand(redisClient *c) {
4088 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
4089 }
4090
4091 static void setnxCommand(redisClient *c) {
4092 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4093 }
4094
4095 static void setexCommand(redisClient *c) {
4096 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
4097 }
4098
4099 static int getGenericCommand(redisClient *c) {
4100 robj *o;
4101
4102 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4103 return REDIS_OK;
4104
4105 if (o->type != REDIS_STRING) {
4106 addReply(c,shared.wrongtypeerr);
4107 return REDIS_ERR;
4108 } else {
4109 addReplyBulk(c,o);
4110 return REDIS_OK;
4111 }
4112 }
4113
4114 static void getCommand(redisClient *c) {
4115 getGenericCommand(c);
4116 }
4117
4118 static void getsetCommand(redisClient *c) {
4119 if (getGenericCommand(c) == REDIS_ERR) return;
4120 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4121 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4122 } else {
4123 incrRefCount(c->argv[1]);
4124 }
4125 incrRefCount(c->argv[2]);
4126 server.dirty++;
4127 removeExpire(c->db,c->argv[1]);
4128 }
4129
4130 static void mgetCommand(redisClient *c) {
4131 int j;
4132
4133 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4134 for (j = 1; j < c->argc; j++) {
4135 robj *o = lookupKeyRead(c->db,c->argv[j]);
4136 if (o == NULL) {
4137 addReply(c,shared.nullbulk);
4138 } else {
4139 if (o->type != REDIS_STRING) {
4140 addReply(c,shared.nullbulk);
4141 } else {
4142 addReplyBulk(c,o);
4143 }
4144 }
4145 }
4146 }
4147
4148 static void msetGenericCommand(redisClient *c, int nx) {
4149 int j, busykeys = 0;
4150
4151 if ((c->argc % 2) == 0) {
4152 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4153 return;
4154 }
4155 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4156 * set nothing at all if at least one already key exists. */
4157 if (nx) {
4158 for (j = 1; j < c->argc; j += 2) {
4159 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4160 busykeys++;
4161 }
4162 }
4163 }
4164 if (busykeys) {
4165 addReply(c, shared.czero);
4166 return;
4167 }
4168
4169 for (j = 1; j < c->argc; j += 2) {
4170 int retval;
4171
4172 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4173 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4174 if (retval == DICT_ERR) {
4175 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4176 incrRefCount(c->argv[j+1]);
4177 } else {
4178 incrRefCount(c->argv[j]);
4179 incrRefCount(c->argv[j+1]);
4180 }
4181 removeExpire(c->db,c->argv[j]);
4182 }
4183 server.dirty += (c->argc-1)/2;
4184 addReply(c, nx ? shared.cone : shared.ok);
4185 }
4186
4187 static void msetCommand(redisClient *c) {
4188 msetGenericCommand(c,0);
4189 }
4190
4191 static void msetnxCommand(redisClient *c) {
4192 msetGenericCommand(c,1);
4193 }
4194
4195 static void incrDecrCommand(redisClient *c, long long incr) {
4196 long long value;
4197 int retval;
4198 robj *o;
4199
4200 o = lookupKeyWrite(c->db,c->argv[1]);
4201
4202 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4203
4204 value += incr;
4205 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4206 o = tryObjectEncoding(o);
4207 retval = dictAdd(c->db->dict,c->argv[1],o);
4208 if (retval == DICT_ERR) {
4209 dictReplace(c->db->dict,c->argv[1],o);
4210 removeExpire(c->db,c->argv[1]);
4211 } else {
4212 incrRefCount(c->argv[1]);
4213 }
4214 server.dirty++;
4215 addReply(c,shared.colon);
4216 addReply(c,o);
4217 addReply(c,shared.crlf);
4218 }
4219
4220 static void incrCommand(redisClient *c) {
4221 incrDecrCommand(c,1);
4222 }
4223
4224 static void decrCommand(redisClient *c) {
4225 incrDecrCommand(c,-1);
4226 }
4227
4228 static void incrbyCommand(redisClient *c) {
4229 long long incr;
4230
4231 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4232 incrDecrCommand(c,incr);
4233 }
4234
4235 static void decrbyCommand(redisClient *c) {
4236 long long incr;
4237
4238 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4239 incrDecrCommand(c,-incr);
4240 }
4241
4242 static void appendCommand(redisClient *c) {
4243 int retval;
4244 size_t totlen;
4245 robj *o;
4246
4247 o = lookupKeyWrite(c->db,c->argv[1]);
4248 if (o == NULL) {
4249 /* Create the key */
4250 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4251 incrRefCount(c->argv[1]);
4252 incrRefCount(c->argv[2]);
4253 totlen = stringObjectLen(c->argv[2]);
4254 } else {
4255 dictEntry *de;
4256
4257 de = dictFind(c->db->dict,c->argv[1]);
4258 assert(de != NULL);
4259
4260 o = dictGetEntryVal(de);
4261 if (o->type != REDIS_STRING) {
4262 addReply(c,shared.wrongtypeerr);
4263 return;
4264 }
4265 /* If the object is specially encoded or shared we have to make
4266 * a copy */
4267 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4268 robj *decoded = getDecodedObject(o);
4269
4270 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4271 decrRefCount(decoded);
4272 dictReplace(c->db->dict,c->argv[1],o);
4273 }
4274 /* APPEND! */
4275 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4276 o->ptr = sdscatlen(o->ptr,
4277 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4278 } else {
4279 o->ptr = sdscatprintf(o->ptr, "%ld",
4280 (unsigned long) c->argv[2]->ptr);
4281 }
4282 totlen = sdslen(o->ptr);
4283 }
4284 server.dirty++;
4285 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4286 }
4287
4288 static void substrCommand(redisClient *c) {
4289 robj *o;
4290 long start = atoi(c->argv[2]->ptr);
4291 long end = atoi(c->argv[3]->ptr);
4292 size_t rangelen, strlen;
4293 sds range;
4294
4295 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4296 checkType(c,o,REDIS_STRING)) return;
4297
4298 o = getDecodedObject(o);
4299 strlen = sdslen(o->ptr);
4300
4301 /* convert negative indexes */
4302 if (start < 0) start = strlen+start;
4303 if (end < 0) end = strlen+end;
4304 if (start < 0) start = 0;
4305 if (end < 0) end = 0;
4306
4307 /* indexes sanity checks */
4308 if (start > end || (size_t)start >= strlen) {
4309 /* Out of range start or start > end result in null reply */
4310 addReply(c,shared.nullbulk);
4311 decrRefCount(o);
4312 return;
4313 }
4314 if ((size_t)end >= strlen) end = strlen-1;
4315 rangelen = (end-start)+1;
4316
4317 /* Return the result */
4318 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4319 range = sdsnewlen((char*)o->ptr+start,rangelen);
4320 addReplySds(c,range);
4321 addReply(c,shared.crlf);
4322 decrRefCount(o);
4323 }
4324
4325 /* ========================= Type agnostic commands ========================= */
4326
4327 static void delCommand(redisClient *c) {
4328 int deleted = 0, j;
4329
4330 for (j = 1; j < c->argc; j++) {
4331 if (deleteKey(c->db,c->argv[j])) {
4332 server.dirty++;
4333 deleted++;
4334 }
4335 }
4336 addReplyLong(c,deleted);
4337 }
4338
4339 static void existsCommand(redisClient *c) {
4340 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4341 }
4342
4343 static void selectCommand(redisClient *c) {
4344 int id = atoi(c->argv[1]->ptr);
4345
4346 if (selectDb(c,id) == REDIS_ERR) {
4347 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4348 } else {
4349 addReply(c,shared.ok);
4350 }
4351 }
4352
4353 static void randomkeyCommand(redisClient *c) {
4354 dictEntry *de;
4355 robj *key;
4356
4357 while(1) {
4358 de = dictGetRandomKey(c->db->dict);
4359 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4360 }
4361
4362 if (de == NULL) {
4363 addReply(c,shared.nullbulk);
4364 return;
4365 }
4366
4367 key = dictGetEntryKey(de);
4368 if (server.vm_enabled) {
4369 key = dupStringObject(key);
4370 addReplyBulk(c,key);
4371 decrRefCount(key);
4372 } else {
4373 addReplyBulk(c,key);
4374 }
4375 }
4376
4377 static void keysCommand(redisClient *c) {
4378 dictIterator *di;
4379 dictEntry *de;
4380 sds pattern = c->argv[1]->ptr;
4381 int plen = sdslen(pattern);
4382 unsigned long numkeys = 0;
4383 robj *lenobj = createObject(REDIS_STRING,NULL);
4384
4385 di = dictGetIterator(c->db->dict);
4386 addReply(c,lenobj);
4387 decrRefCount(lenobj);
4388 while((de = dictNext(di)) != NULL) {
4389 robj *keyobj = dictGetEntryKey(de);
4390
4391 sds key = keyobj->ptr;
4392 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4393 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4394 if (expireIfNeeded(c->db,keyobj) == 0) {
4395 addReplyBulk(c,keyobj);
4396 numkeys++;
4397 }
4398 }
4399 }
4400 dictReleaseIterator(di);
4401 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4402 }
4403
4404 static void dbsizeCommand(redisClient *c) {
4405 addReplySds(c,
4406 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4407 }
4408
4409 static void lastsaveCommand(redisClient *c) {
4410 addReplySds(c,
4411 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4412 }
4413
4414 static void typeCommand(redisClient *c) {
4415 robj *o;
4416 char *type;
4417
4418 o = lookupKeyRead(c->db,c->argv[1]);
4419 if (o == NULL) {
4420 type = "+none";
4421 } else {
4422 switch(o->type) {
4423 case REDIS_STRING: type = "+string"; break;
4424 case REDIS_LIST: type = "+list"; break;
4425 case REDIS_SET: type = "+set"; break;
4426 case REDIS_ZSET: type = "+zset"; break;
4427 case REDIS_HASH: type = "+hash"; break;
4428 default: type = "+unknown"; break;
4429 }
4430 }
4431 addReplySds(c,sdsnew(type));
4432 addReply(c,shared.crlf);
4433 }
4434
4435 static void saveCommand(redisClient *c) {
4436 if (server.bgsavechildpid != -1) {
4437 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4438 return;
4439 }
4440 if (rdbSave(server.dbfilename) == REDIS_OK) {
4441 addReply(c,shared.ok);
4442 } else {
4443 addReply(c,shared.err);
4444 }
4445 }
4446
4447 static void bgsaveCommand(redisClient *c) {
4448 if (server.bgsavechildpid != -1) {
4449 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4450 return;
4451 }
4452 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4453 char *status = "+Background saving started\r\n";
4454 addReplySds(c,sdsnew(status));
4455 } else {
4456 addReply(c,shared.err);
4457 }
4458 }
4459
4460 static void shutdownCommand(redisClient *c) {
4461 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4462 /* Kill the saving child if there is a background saving in progress.
4463 We want to avoid race conditions, for instance our saving child may
4464 overwrite the synchronous saving did by SHUTDOWN. */
4465 if (server.bgsavechildpid != -1) {
4466 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4467 kill(server.bgsavechildpid,SIGKILL);
4468 rdbRemoveTempFile(server.bgsavechildpid);
4469 }
4470 if (server.appendonly) {
4471 /* Append only file: fsync() the AOF and exit */
4472 fsync(server.appendfd);
4473 if (server.vm_enabled) unlink(server.vm_swap_file);
4474 exit(0);
4475 } else {
4476 /* Snapshotting. Perform a SYNC SAVE and exit */
4477 if (rdbSave(server.dbfilename) == REDIS_OK) {
4478 if (server.daemonize)
4479 unlink(server.pidfile);
4480 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4481 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4482 if (server.vm_enabled) unlink(server.vm_swap_file);
4483 exit(0);
4484 } else {
4485 /* Ooops.. error saving! The best we can do is to continue
4486 * operating. Note that if there was a background saving process,
4487 * in the next cron() Redis will be notified that the background
4488 * saving aborted, handling special stuff like slaves pending for
4489 * synchronization... */
4490 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4491 addReplySds(c,
4492 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4493 }
4494 }
4495 }
4496
4497 static void renameGenericCommand(redisClient *c, int nx) {
4498 robj *o;
4499
4500 /* To use the same key as src and dst is probably an error */
4501 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4502 addReply(c,shared.sameobjecterr);
4503 return;
4504 }
4505
4506 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4507 return;
4508
4509 incrRefCount(o);
4510 deleteIfVolatile(c->db,c->argv[2]);
4511 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4512 if (nx) {
4513 decrRefCount(o);
4514 addReply(c,shared.czero);
4515 return;
4516 }
4517 dictReplace(c->db->dict,c->argv[2],o);
4518 } else {
4519 incrRefCount(c->argv[2]);
4520 }
4521 deleteKey(c->db,c->argv[1]);
4522 server.dirty++;
4523 addReply(c,nx ? shared.cone : shared.ok);
4524 }
4525
4526 static void renameCommand(redisClient *c) {
4527 renameGenericCommand(c,0);
4528 }
4529
4530 static void renamenxCommand(redisClient *c) {
4531 renameGenericCommand(c,1);
4532 }
4533
4534 static void moveCommand(redisClient *c) {
4535 robj *o;
4536 redisDb *src, *dst;
4537 int srcid;
4538
4539 /* Obtain source and target DB pointers */
4540 src = c->db;
4541 srcid = c->db->id;
4542 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4543 addReply(c,shared.outofrangeerr);
4544 return;
4545 }
4546 dst = c->db;
4547 selectDb(c,srcid); /* Back to the source DB */
4548
4549 /* If the user is moving using as target the same
4550 * DB as the source DB it is probably an error. */
4551 if (src == dst) {
4552 addReply(c,shared.sameobjecterr);
4553 return;
4554 }
4555
4556 /* Check if the element exists and get a reference */
4557 o = lookupKeyWrite(c->db,c->argv[1]);
4558 if (!o) {
4559 addReply(c,shared.czero);
4560 return;
4561 }
4562
4563 /* Try to add the element to the target DB */
4564 deleteIfVolatile(dst,c->argv[1]);
4565 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4566 addReply(c,shared.czero);
4567 return;
4568 }
4569 incrRefCount(c->argv[1]);
4570 incrRefCount(o);
4571
4572 /* OK! key moved, free the entry in the source DB */
4573 deleteKey(src,c->argv[1]);
4574 server.dirty++;
4575 addReply(c,shared.cone);
4576 }
4577
4578 /* =================================== Lists ================================ */
4579 static void pushGenericCommand(redisClient *c, int where) {
4580 robj *lobj;
4581 list *list;
4582
4583 lobj = lookupKeyWrite(c->db,c->argv[1]);
4584 if (lobj == NULL) {
4585 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4586 addReply(c,shared.cone);
4587 return;
4588 }
4589 lobj = createListObject();
4590 list = lobj->ptr;
4591 if (where == REDIS_HEAD) {
4592 listAddNodeHead(list,c->argv[2]);
4593 } else {
4594 listAddNodeTail(list,c->argv[2]);
4595 }
4596 dictAdd(c->db->dict,c->argv[1],lobj);
4597 incrRefCount(c->argv[1]);
4598 incrRefCount(c->argv[2]);
4599 } else {
4600 if (lobj->type != REDIS_LIST) {
4601 addReply(c,shared.wrongtypeerr);
4602 return;
4603 }
4604 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4605 addReply(c,shared.cone);
4606 return;
4607 }
4608 list = lobj->ptr;
4609 if (where == REDIS_HEAD) {
4610 listAddNodeHead(list,c->argv[2]);
4611 } else {
4612 listAddNodeTail(list,c->argv[2]);
4613 }
4614 incrRefCount(c->argv[2]);
4615 }
4616 server.dirty++;
4617 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4618 }
4619
4620 static void lpushCommand(redisClient *c) {
4621 pushGenericCommand(c,REDIS_HEAD);
4622 }
4623
4624 static void rpushCommand(redisClient *c) {
4625 pushGenericCommand(c,REDIS_TAIL);
4626 }
4627
4628 static void llenCommand(redisClient *c) {
4629 robj *o;
4630 list *l;
4631
4632 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4633 checkType(c,o,REDIS_LIST)) return;
4634
4635 l = o->ptr;
4636 addReplyUlong(c,listLength(l));
4637 }
4638
4639 static void lindexCommand(redisClient *c) {
4640 robj *o;
4641 int index = atoi(c->argv[2]->ptr);
4642 list *list;
4643 listNode *ln;
4644
4645 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4646 checkType(c,o,REDIS_LIST)) return;
4647 list = o->ptr;
4648
4649 ln = listIndex(list, index);
4650 if (ln == NULL) {
4651 addReply(c,shared.nullbulk);
4652 } else {
4653 robj *ele = listNodeValue(ln);
4654 addReplyBulk(c,ele);
4655 }
4656 }
4657
4658 static void lsetCommand(redisClient *c) {
4659 robj *o;
4660 int index = atoi(c->argv[2]->ptr);
4661 list *list;
4662 listNode *ln;
4663
4664 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4665 checkType(c,o,REDIS_LIST)) return;
4666 list = o->ptr;
4667
4668 ln = listIndex(list, index);
4669 if (ln == NULL) {
4670 addReply(c,shared.outofrangeerr);
4671 } else {
4672 robj *ele = listNodeValue(ln);
4673
4674 decrRefCount(ele);
4675 listNodeValue(ln) = c->argv[3];
4676 incrRefCount(c->argv[3]);
4677 addReply(c,shared.ok);
4678 server.dirty++;
4679 }
4680 }
4681
4682 static void popGenericCommand(redisClient *c, int where) {
4683 robj *o;
4684 list *list;
4685 listNode *ln;
4686
4687 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4688 checkType(c,o,REDIS_LIST)) return;
4689 list = o->ptr;
4690
4691 if (where == REDIS_HEAD)
4692 ln = listFirst(list);
4693 else
4694 ln = listLast(list);
4695
4696 if (ln == NULL) {
4697 addReply(c,shared.nullbulk);
4698 } else {
4699 robj *ele = listNodeValue(ln);
4700 addReplyBulk(c,ele);
4701 listDelNode(list,ln);
4702 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4703 server.dirty++;
4704 }
4705 }
4706
4707 static void lpopCommand(redisClient *c) {
4708 popGenericCommand(c,REDIS_HEAD);
4709 }
4710
4711 static void rpopCommand(redisClient *c) {
4712 popGenericCommand(c,REDIS_TAIL);
4713 }
4714
4715 static void lrangeCommand(redisClient *c) {
4716 robj *o;
4717 int start = atoi(c->argv[2]->ptr);
4718 int end = atoi(c->argv[3]->ptr);
4719 int llen;
4720 int rangelen, j;
4721 list *list;
4722 listNode *ln;
4723 robj *ele;
4724
4725 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4726 || checkType(c,o,REDIS_LIST)) return;
4727 list = o->ptr;
4728 llen = listLength(list);
4729
4730 /* convert negative indexes */
4731 if (start < 0) start = llen+start;
4732 if (end < 0) end = llen+end;
4733 if (start < 0) start = 0;
4734 if (end < 0) end = 0;
4735
4736 /* indexes sanity checks */
4737 if (start > end || start >= llen) {
4738 /* Out of range start or start > end result in empty list */
4739 addReply(c,shared.emptymultibulk);
4740 return;
4741 }
4742 if (end >= llen) end = llen-1;
4743 rangelen = (end-start)+1;
4744
4745 /* Return the result in form of a multi-bulk reply */
4746 ln = listIndex(list, start);
4747 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4748 for (j = 0; j < rangelen; j++) {
4749 ele = listNodeValue(ln);
4750 addReplyBulk(c,ele);
4751 ln = ln->next;
4752 }
4753 }
4754
4755 static void ltrimCommand(redisClient *c) {
4756 robj *o;
4757 int start = atoi(c->argv[2]->ptr);
4758 int end = atoi(c->argv[3]->ptr);
4759 int llen;
4760 int j, ltrim, rtrim;
4761 list *list;
4762 listNode *ln;
4763
4764 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4765 checkType(c,o,REDIS_LIST)) return;
4766 list = o->ptr;
4767 llen = listLength(list);
4768
4769 /* convert negative indexes */
4770 if (start < 0) start = llen+start;
4771 if (end < 0) end = llen+end;
4772 if (start < 0) start = 0;
4773 if (end < 0) end = 0;
4774
4775 /* indexes sanity checks */
4776 if (start > end || start >= llen) {
4777 /* Out of range start or start > end result in empty list */
4778 ltrim = llen;
4779 rtrim = 0;
4780 } else {
4781 if (end >= llen) end = llen-1;
4782 ltrim = start;
4783 rtrim = llen-end-1;
4784 }
4785
4786 /* Remove list elements to perform the trim */
4787 for (j = 0; j < ltrim; j++) {
4788 ln = listFirst(list);
4789 listDelNode(list,ln);
4790 }
4791 for (j = 0; j < rtrim; j++) {
4792 ln = listLast(list);
4793 listDelNode(list,ln);
4794 }
4795 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4796 server.dirty++;
4797 addReply(c,shared.ok);
4798 }
4799
4800 static void lremCommand(redisClient *c) {
4801 robj *o;
4802 list *list;
4803 listNode *ln, *next;
4804 int toremove = atoi(c->argv[2]->ptr);
4805 int removed = 0;
4806 int fromtail = 0;
4807
4808 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4809 checkType(c,o,REDIS_LIST)) return;
4810 list = o->ptr;
4811
4812 if (toremove < 0) {
4813 toremove = -toremove;
4814 fromtail = 1;
4815 }
4816 ln = fromtail ? list->tail : list->head;
4817 while (ln) {
4818 robj *ele = listNodeValue(ln);
4819
4820 next = fromtail ? ln->prev : ln->next;
4821 if (compareStringObjects(ele,c->argv[3]) == 0) {
4822 listDelNode(list,ln);
4823 server.dirty++;
4824 removed++;
4825 if (toremove && removed == toremove) break;
4826 }
4827 ln = next;
4828 }
4829 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4830 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4831 }
4832
4833 /* This is the semantic of this command:
4834 * RPOPLPUSH srclist dstlist:
4835 * IF LLEN(srclist) > 0
4836 * element = RPOP srclist
4837 * LPUSH dstlist element
4838 * RETURN element
4839 * ELSE
4840 * RETURN nil
4841 * END
4842 * END
4843 *
4844 * The idea is to be able to get an element from a list in a reliable way
4845 * since the element is not just returned but pushed against another list
4846 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4847 */
4848 static void rpoplpushcommand(redisClient *c) {
4849 robj *sobj;
4850 list *srclist;
4851 listNode *ln;
4852
4853 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4854 checkType(c,sobj,REDIS_LIST)) return;
4855 srclist = sobj->ptr;
4856 ln = listLast(srclist);
4857
4858 if (ln == NULL) {
4859 addReply(c,shared.nullbulk);
4860 } else {
4861 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4862 robj *ele = listNodeValue(ln);
4863 list *dstlist;
4864
4865 if (dobj && dobj->type != REDIS_LIST) {
4866 addReply(c,shared.wrongtypeerr);
4867 return;
4868 }
4869
4870 /* Add the element to the target list (unless it's directly
4871 * passed to some BLPOP-ing client */
4872 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4873 if (dobj == NULL) {
4874 /* Create the list if the key does not exist */
4875 dobj = createListObject();
4876 dictAdd(c->db->dict,c->argv[2],dobj);
4877 incrRefCount(c->argv[2]);
4878 }
4879 dstlist = dobj->ptr;
4880 listAddNodeHead(dstlist,ele);
4881 incrRefCount(ele);
4882 }
4883
4884 /* Send the element to the client as reply as well */
4885 addReplyBulk(c,ele);
4886
4887 /* Finally remove the element from the source list */
4888 listDelNode(srclist,ln);
4889 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4890 server.dirty++;
4891 }
4892 }
4893
4894 /* ==================================== Sets ================================ */
4895
4896 static void saddCommand(redisClient *c) {
4897 robj *set;
4898
4899 set = lookupKeyWrite(c->db,c->argv[1]);
4900 if (set == NULL) {
4901 set = createSetObject();
4902 dictAdd(c->db->dict,c->argv[1],set);
4903 incrRefCount(c->argv[1]);
4904 } else {
4905 if (set->type != REDIS_SET) {
4906 addReply(c,shared.wrongtypeerr);
4907 return;
4908 }
4909 }
4910 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4911 incrRefCount(c->argv[2]);
4912 server.dirty++;
4913 addReply(c,shared.cone);
4914 } else {
4915 addReply(c,shared.czero);
4916 }
4917 }
4918
4919 static void sremCommand(redisClient *c) {
4920 robj *set;
4921
4922 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4923 checkType(c,set,REDIS_SET)) return;
4924
4925 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4926 server.dirty++;
4927 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4928 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4929 addReply(c,shared.cone);
4930 } else {
4931 addReply(c,shared.czero);
4932 }
4933 }
4934
4935 static void smoveCommand(redisClient *c) {
4936 robj *srcset, *dstset;
4937
4938 srcset = lookupKeyWrite(c->db,c->argv[1]);
4939 dstset = lookupKeyWrite(c->db,c->argv[2]);
4940
4941 /* If the source key does not exist return 0, if it's of the wrong type
4942 * raise an error */
4943 if (srcset == NULL || srcset->type != REDIS_SET) {
4944 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4945 return;
4946 }
4947 /* Error if the destination key is not a set as well */
4948 if (dstset && dstset->type != REDIS_SET) {
4949 addReply(c,shared.wrongtypeerr);
4950 return;
4951 }
4952 /* Remove the element from the source set */
4953 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4954 /* Key not found in the src set! return zero */
4955 addReply(c,shared.czero);
4956 return;
4957 }
4958 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4959 deleteKey(c->db,c->argv[1]);
4960 server.dirty++;
4961 /* Add the element to the destination set */
4962 if (!dstset) {
4963 dstset = createSetObject();
4964 dictAdd(c->db->dict,c->argv[2],dstset);
4965 incrRefCount(c->argv[2]);
4966 }
4967 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4968 incrRefCount(c->argv[3]);
4969 addReply(c,shared.cone);
4970 }
4971
4972 static void sismemberCommand(redisClient *c) {
4973 robj *set;
4974
4975 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4976 checkType(c,set,REDIS_SET)) return;
4977
4978 if (dictFind(set->ptr,c->argv[2]))
4979 addReply(c,shared.cone);
4980 else
4981 addReply(c,shared.czero);
4982 }
4983
4984 static void scardCommand(redisClient *c) {
4985 robj *o;
4986 dict *s;
4987
4988 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4989 checkType(c,o,REDIS_SET)) return;
4990
4991 s = o->ptr;
4992 addReplyUlong(c,dictSize(s));
4993 }
4994
4995 static void spopCommand(redisClient *c) {
4996 robj *set;
4997 dictEntry *de;
4998
4999 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5000 checkType(c,set,REDIS_SET)) return;
5001
5002 de = dictGetRandomKey(set->ptr);
5003 if (de == NULL) {
5004 addReply(c,shared.nullbulk);
5005 } else {
5006 robj *ele = dictGetEntryKey(de);
5007
5008 addReplyBulk(c,ele);
5009 dictDelete(set->ptr,ele);
5010 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
5011 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
5012 server.dirty++;
5013 }
5014 }
5015
5016 static void srandmemberCommand(redisClient *c) {
5017 robj *set;
5018 dictEntry *de;
5019
5020 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5021 checkType(c,set,REDIS_SET)) return;
5022
5023 de = dictGetRandomKey(set->ptr);
5024 if (de == NULL) {
5025 addReply(c,shared.nullbulk);
5026 } else {
5027 robj *ele = dictGetEntryKey(de);
5028
5029 addReplyBulk(c,ele);
5030 }
5031 }
5032
5033 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5034 dict **d1 = (void*) s1, **d2 = (void*) s2;
5035
5036 return dictSize(*d1)-dictSize(*d2);
5037 }
5038
/* Shared implementation of SINTER and SINTERSTORE.
 *
 * 'setskeys' is an array of 'setsnum' keys naming the sets to intersect.
 * When 'dstkey' is NULL the intersection is sent to the client as a
 * multi-bulk reply; otherwise it is stored at 'dstkey' and the reply is the
 * number of elements stored.
 *
 * If any of the keys is missing the result is empty: an empty multi-bulk is
 * sent, or 'dstkey' is deleted and ":0" is sent. If any key holds a value
 * that is not a set a wrong type error is sent. */
static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
    dict **dv = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *di;
    dictEntry *de;
    robj *lenobj = NULL, *dstset = NULL;
    unsigned long j, cardinality = 0;

    /* Collect the hash table of every input set into dv[], bailing out
     * (and releasing dv) on the first missing or wrongly typed key. */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        setobj = dstkey ?
            lookupKeyWrite(c->db,setskeys[j]) :
            lookupKeyRead(c->db,setskeys[j]);
        if (!setobj) {
            zfree(dv);
            if (dstkey) {
                /* Only count a change if the target actually existed */
                if (deleteKey(c->db,dstkey))
                    server.dirty++;
                addReply(c,shared.czero);
            } else {
                addReply(c,shared.emptymultibulk);
            }
            return;
        }
        if (setobj->type != REDIS_SET) {
            zfree(dv);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        dv[j] = setobj->ptr;
    }
    /* Sort sets from the smallest to largest, this will improve our
     * algorithm's performance */
    qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);

    /* The first thing we should output is the total number of elements...
     * since this is a multi-bulk write, but at this stage we don't know
     * the intersection set size, so we use a trick, append an empty object
     * to the output list and save the pointer to later modify it with the
     * right length */
    if (!dstkey) {
        lenobj = createObject(REDIS_STRING,NULL);
        addReply(c,lenobj);
        /* NOTE(review): lenobj is written again at the end of the function,
         * so this presumably relies on addReply() keeping its own reference
         * to the object -- confirm against addReply(). */
        decrRefCount(lenobj);
    } else {
        /* If we have a target key where to store the resulting set
         * create this key with an empty set inside */
        dstset = createSetObject();
    }

    /* Iterate all the elements of the first (smallest) set, and test
     * the element against all the other sets, if at least one set does
     * not include the element it is discarded */
    di = dictGetIterator(dv[0]);

    while((de = dictNext(di)) != NULL) {
        robj *ele;

        /* The loop stops early at the first set missing the element, so
         * j == setsnum only when every other set contains it. */
        for (j = 1; j < setsnum; j++)
            if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
        if (j != setsnum)
            continue; /* at least one set does not contain the member */
        ele = dictGetEntryKey(de);
        if (!dstkey) {
            addReplyBulk(c,ele);
            cardinality++;
        } else {
            dictAdd(dstset->ptr,ele,NULL);
            incrRefCount(ele); /* the result set references the element */
        }
    }
    dictReleaseIterator(di);

    if (dstkey) {
        /* Store the resulting set into the target, if the intersection
         * is not an empty set. */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    } else {
        /* Fill the placeholder queued above with the multi-bulk header */
        lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
    }
    zfree(dv);
}
5130
5131 static void sinterCommand(redisClient *c) {
5132 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5133 }
5134
5135 static void sinterstoreCommand(redisClient *c) {
5136 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5137 }
5138
/* Operation selectors for sunionDiffGenericCommand(). REDIS_OP_INTER is not
 * referenced by the function below. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2

/* Shared implementation of SUNION, SUNIONSTORE, SDIFF and SDIFFSTORE.
 *
 * 'setskeys' is an array of 'setsnum' keys naming the input sets and 'op'
 * is REDIS_OP_UNION or REDIS_OP_DIFF. For a difference the elements of
 * every following set are removed from the elements of the first one.
 * Missing keys are treated like empty sets; a key holding a value that is
 * not a set produces a wrong type error.
 *
 * When 'dstkey' is NULL the result is sent to the client as a multi-bulk
 * reply, otherwise it is stored at 'dstkey' (or 'dstkey' is deleted if the
 * result is empty) and the reply is the number of elements stored. */
static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
    dict **dv = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *di;
    dictEntry *de;
    robj *dstset = NULL;
    int j, cardinality = 0;

    /* Collect the hash table of every input set into dv[]; a NULL slot
     * marks a missing key. */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        setobj = dstkey ?
            lookupKeyWrite(c->db,setskeys[j]) :
            lookupKeyRead(c->db,setskeys[j]);
        if (!setobj) {
            dv[j] = NULL;
            continue;
        }
        if (setobj->type != REDIS_SET) {
            zfree(dv);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        dv[j] = setobj->ptr;
    }

    /* We need a temp set object to store our union. If the dstkey
     * is not NULL (that is, we are inside an SUNIONSTORE operation) then
     * this set object will be the resulting object to set into the target key*/
    dstset = createSetObject();

    /* Iterate all the elements of all the sets, add every element a single
     * time to the result set */
    for (j = 0; j < setsnum; j++) {
        if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
        if (!dv[j]) continue; /* non existing keys are like empty sets */

        di = dictGetIterator(dv[j]);

        while((de = dictNext(di)) != NULL) {
            robj *ele;

            /* dictAdd will not add the same element multiple times */
            ele = dictGetEntryKey(de);
            if (op == REDIS_OP_UNION || j == 0) {
                /* Union, or the first set of a difference: accumulate */
                if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
                    incrRefCount(ele);
                    cardinality++;
                }
            } else if (op == REDIS_OP_DIFF) {
                /* Following sets of a difference: subtract */
                if (dictDelete(dstset->ptr,ele) == DICT_OK) {
                    cardinality--;
                }
            }
        }
        dictReleaseIterator(di);

        /* result set is empty? Exit asap. */
        if (op == REDIS_OP_DIFF && cardinality == 0) break;
    }

    /* Output the content of the resulting set, if not in STORE mode */
    if (!dstkey) {
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
        di = dictGetIterator(dstset->ptr);
        while((de = dictNext(di)) != NULL) {
            robj *ele;

            ele = dictGetEntryKey(de);
            addReplyBulk(c,ele);
        }
        dictReleaseIterator(di);
        /* The temp set was only needed to build the reply */
        decrRefCount(dstset);
    } else {
        /* If we have a target key where to store the resulting set
         * create this key with the result set inside */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    }
    zfree(dv);
}
5231
5232 static void sunionCommand(redisClient *c) {
5233 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5234 }
5235
5236 static void sunionstoreCommand(redisClient *c) {
5237 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5238 }
5239
5240 static void sdiffCommand(redisClient *c) {
5241 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5242 }
5243
5244 static void sdiffstoreCommand(redisClient *c) {
5245 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5246 }
5247
5248 /* ==================================== ZSets =============================== */
5249
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5257
5258 /* This skiplist implementation is almost a C translation of the original
5259 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5260 * Alternative to Balanced Trees", modified in three ways:
5261 * a) this implementation allows for repeated values.
5262 * b) the comparison is not just by key (our 'score') but by satellite data.
5263 * c) there is a back pointer, so it's a doubly linked list with the back
5264 * pointers being only at "level 1". This allows to traverse the list
5265 * from tail to head, useful for ZREVRANGE. */
5266
5267 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5268 zskiplistNode *zn = zmalloc(sizeof(*zn));
5269
5270 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5271 if (level > 0)
5272 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5273 zn->score = score;
5274 zn->obj = obj;
5275 return zn;
5276 }
5277
5278 static zskiplist *zslCreate(void) {
5279 int j;
5280 zskiplist *zsl;
5281
5282 zsl = zmalloc(sizeof(*zsl));
5283 zsl->level = 1;
5284 zsl->length = 0;
5285 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5286 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5287 zsl->header->forward[j] = NULL;
5288
5289 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5290 if (j < ZSKIPLIST_MAXLEVEL-1)
5291 zsl->header->span[j] = 0;
5292 }
5293 zsl->header->backward = NULL;
5294 zsl->tail = NULL;
5295 return zsl;
5296 }
5297
5298 static void zslFreeNode(zskiplistNode *node) {
5299 decrRefCount(node->obj);
5300 zfree(node->forward);
5301 zfree(node->span);
5302 zfree(node);
5303 }
5304
5305 static void zslFree(zskiplist *zsl) {
5306 zskiplistNode *node = zsl->header->forward[0], *next;
5307
5308 zfree(zsl->header->forward);
5309 zfree(zsl->header->span);
5310 zfree(zsl->header);
5311 while(node) {
5312 next = node->forward[0];
5313 zslFreeNode(node);
5314 node = next;
5315 }
5316 zfree(zsl);
5317 }
5318
5319 static int zslRandomLevel(void) {
5320 int level = 1;
5321 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5322 level += 1;
5323 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5324 }
5325
/* Insert a new node holding 'score' and 'obj' into the skiplist, keeping
 * nodes ordered by score and, for equal scores, by compareStringObjects()
 * on the objects.
 *
 * No reference is taken on 'obj' here. The span of level i > 0 is stored in
 * span[i-1]; level 0 has no stored span and always counts as 1. */
static void zslInsert(zskiplist *zsl, double score, robj *obj) {
    /* update[i]: last node at level i that sorts before the new element.
     * rank[i]: number of elements crossed to reach update[i]. */
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned int rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        /* store rank that is crossed to reach the insert position */
        rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];

        while (x->forward[i] &&
            (x->forward[i]->score < score ||
                (x->forward[i]->score == score &&
                compareStringObjects(x->forward[i]->obj,obj) < 0))) {
            rank[i] += i > 0 ? x->span[i-1] : 1;
            x = x->forward[i];
        }
        update[i] = x;
    }
    /* we assume the key is not already inside, since we allow duplicated
     * scores, and the re-insertion of score and redis object should never
     * happen since the caller of zslInsert() should test in the hash table
     * if the element is already inside or not. */
    level = zslRandomLevel();
    if (level > zsl->level) {
        /* Levels not used so far start at the header, whose span at those
         * levels is set to the current length of the list. */
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->span[i-1] = zsl->length;
        }
        zsl->level = level;
    }
    x = zslCreateNode(level,score,obj);
    for (i = 0; i < level; i++) {
        /* Link the new node right after update[i] at this level */
        x->forward[i] = update[i]->forward[i];
        update[i]->forward[i] = x;

        /* update span covered by update[i] as x is inserted here */
        if (i > 0) {
            x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
            update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
        }
    }

    /* increment span for untouched levels */
    for (i = level; i < zsl->level; i++) {
        update[i]->span[i-1]++;
    }

    /* Maintain the level-0 back pointers; the header is never a backward
     * target, and a node without successor becomes the tail. */
    x->backward = (update[0] == zsl->header) ? NULL : update[0];
    if (x->forward[0])
        x->forward[0]->backward = x;
    else
        zsl->tail = x;
    zsl->length++;
}
5382
5383 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5384 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5385 int i;
5386 for (i = 0; i < zsl->level; i++) {
5387 if (update[i]->forward[i] == x) {
5388 if (i > 0) {
5389 update[i]->span[i-1] += x->span[i-1] - 1;
5390 }
5391 update[i]->forward[i] = x->forward[i];
5392 } else {
5393 /* invariant: i > 0, because update[0]->forward[0]
5394 * is always equal to x */
5395 update[i]->span[i-1] -= 1;
5396 }
5397 }
5398 if (x->forward[0]) {
5399 x->forward[0]->backward = x->backward;
5400 } else {
5401 zsl->tail = x->backward;
5402 }
5403 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5404 zsl->level--;
5405 zsl->length--;
5406 }
5407
5408 /* Delete an element with matching score/object from the skiplist. */
5409 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5410 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5411 int i;
5412
5413 x = zsl->header;
5414 for (i = zsl->level-1; i >= 0; i--) {
5415 while (x->forward[i] &&
5416 (x->forward[i]->score < score ||
5417 (x->forward[i]->score == score &&
5418 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5419 x = x->forward[i];
5420 update[i] = x;
5421 }
5422 /* We may have multiple elements with the same score, what we need
5423 * is to find the element with both the right score and object. */
5424 x = x->forward[0];
5425 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5426 zslDeleteNode(zsl, x, update);
5427 zslFreeNode(x);
5428 return 1;
5429 } else {
5430 return 0; /* not found */
5431 }
5432 return 0; /* not found */
5433 }
5434
5435 /* Delete all the elements with score between min and max from the skiplist.
5436 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5437 * Note that this function takes the reference to the hash table view of the
5438 * sorted set, in order to remove the elements from the hash table too. */
5439 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5440 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5441 unsigned long removed = 0;
5442 int i;
5443
5444 x = zsl->header;
5445 for (i = zsl->level-1; i >= 0; i--) {
5446 while (x->forward[i] && x->forward[i]->score < min)
5447 x = x->forward[i];
5448 update[i] = x;
5449 }
5450 /* We may have multiple elements with the same score, what we need
5451 * is to find the element with both the right score and object. */
5452 x = x->forward[0];
5453 while (x && x->score <= max) {
5454 zskiplistNode *next = x->forward[0];
5455 zslDeleteNode(zsl, x, update);
5456 dictDelete(dict,x->obj);
5457 zslFreeNode(x);
5458 removed++;
5459 x = next;
5460 }
5461 return removed; /* not found */
5462 }
5463
5464 /* Delete all the elements with rank between start and end from the skiplist.
5465 * Start and end are inclusive. Note that start and end need to be 1-based */
5466 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5467 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5468 unsigned long traversed = 0, removed = 0;
5469 int i;
5470
5471 x = zsl->header;
5472 for (i = zsl->level-1; i >= 0; i--) {
5473 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5474 traversed += i > 0 ? x->span[i-1] : 1;
5475 x = x->forward[i];
5476 }
5477 update[i] = x;
5478 }
5479
5480 traversed++;
5481 x = x->forward[0];
5482 while (x && traversed <= end) {
5483 zskiplistNode *next = x->forward[0];
5484 zslDeleteNode(zsl, x, update);
5485 dictDelete(dict,x->obj);
5486 zslFreeNode(x);
5487 removed++;
5488 traversed++;
5489 x = next;
5490 }
5491 return removed;
5492 }
5493
5494 /* Find the first node having a score equal or greater than the specified one.
5495 * Returns NULL if there is no match. */
5496 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5497 zskiplistNode *x;
5498 int i;
5499
5500 x = zsl->header;
5501 for (i = zsl->level-1; i >= 0; i--) {
5502 while (x->forward[i] && x->forward[i]->score < score)
5503 x = x->forward[i];
5504 }
5505 /* We may have multiple elements with the same score, what we need
5506 * is to find the element with both the right score and object. */
5507 return x->forward[0];
5508 }
5509
5510 /* Find the rank for an element by both score and key.
5511 * Returns 0 when the element cannot be found, rank otherwise.
5512 * Note that the rank is 1-based due to the span of zsl->header to the
5513 * first element. */
5514 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5515 zskiplistNode *x;
5516 unsigned long rank = 0;
5517 int i;
5518
5519 x = zsl->header;
5520 for (i = zsl->level-1; i >= 0; i--) {
5521 while (x->forward[i] &&
5522 (x->forward[i]->score < score ||
5523 (x->forward[i]->score == score &&
5524 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5525 rank += i > 0 ? x->span[i-1] : 1;
5526 x = x->forward[i];
5527 }
5528
5529 /* x might be equal to zsl->header, so test if obj is non-NULL */
5530 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5531 return rank;
5532 }
5533 }
5534 return 0;
5535 }
5536
5537 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5538 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5539 zskiplistNode *x;
5540 unsigned long traversed = 0;
5541 int i;
5542
5543 x = zsl->header;
5544 for (i = zsl->level-1; i >= 0; i--) {
5545 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5546 {
5547 traversed += i > 0 ? x->span[i-1] : 1;
5548 x = x->forward[i];
5549 }
5550 if (traversed == rank) {
5551 return x;
5552 }
5553 }
5554 return NULL;
5555 }
5556
5557 /* The actual Z-commands implementations */
5558
5559 /* This generic command implements both ZADD and ZINCRBY.
5560 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5561 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5562 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5563 robj *zsetobj;
5564 zset *zs;
5565 double *score;
5566
5567 zsetobj = lookupKeyWrite(c->db,key);
5568 if (zsetobj == NULL) {
5569 zsetobj = createZsetObject();
5570 dictAdd(c->db->dict,key,zsetobj);
5571 incrRefCount(key);
5572 } else {
5573 if (zsetobj->type != REDIS_ZSET) {
5574 addReply(c,shared.wrongtypeerr);
5575 return;
5576 }
5577 }
5578 zs = zsetobj->ptr;
5579
5580 /* Ok now since we implement both ZADD and ZINCRBY here the code
5581 * needs to handle the two different conditions. It's all about setting
5582 * '*score', that is, the new score to set, to the right value. */
5583 score = zmalloc(sizeof(double));
5584 if (doincrement) {
5585 dictEntry *de;
5586
5587 /* Read the old score. If the element was not present starts from 0 */
5588 de = dictFind(zs->dict,ele);
5589 if (de) {
5590 double *oldscore = dictGetEntryVal(de);
5591 *score = *oldscore + scoreval;
5592 } else {
5593 *score = scoreval;
5594 }
5595 } else {
5596 *score = scoreval;
5597 }
5598
5599 /* What follows is a simple remove and re-insert operation that is common
5600 * to both ZADD and ZINCRBY... */
5601 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5602 /* case 1: New element */
5603 incrRefCount(ele); /* added to hash */
5604 zslInsert(zs->zsl,*score,ele);
5605 incrRefCount(ele); /* added to skiplist */
5606 server.dirty++;
5607 if (doincrement)
5608 addReplyDouble(c,*score);
5609 else
5610 addReply(c,shared.cone);
5611 } else {
5612 dictEntry *de;
5613 double *oldscore;
5614
5615 /* case 2: Score update operation */
5616 de = dictFind(zs->dict,ele);
5617 redisAssert(de != NULL);
5618 oldscore = dictGetEntryVal(de);
5619 if (*score != *oldscore) {
5620 int deleted;
5621
5622 /* Remove and insert the element in the skip list with new score */
5623 deleted = zslDelete(zs->zsl,*oldscore,ele);
5624 redisAssert(deleted != 0);
5625 zslInsert(zs->zsl,*score,ele);
5626 incrRefCount(ele);
5627 /* Update the score in the hash table */
5628 dictReplace(zs->dict,ele,score);
5629 server.dirty++;
5630 } else {
5631 zfree(score);
5632 }
5633 if (doincrement)
5634 addReplyDouble(c,*score);
5635 else
5636 addReply(c,shared.czero);
5637 }
5638 }
5639
5640 static void zaddCommand(redisClient *c) {
5641 double scoreval;
5642
5643 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5644 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5645 }
5646
5647 static void zincrbyCommand(redisClient *c) {
5648 double scoreval;
5649
5650 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5651 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5652 }
5653
5654 static void zremCommand(redisClient *c) {
5655 robj *zsetobj;
5656 zset *zs;
5657 dictEntry *de;
5658 double *oldscore;
5659 int deleted;
5660
5661 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5662 checkType(c,zsetobj,REDIS_ZSET)) return;
5663
5664 zs = zsetobj->ptr;
5665 de = dictFind(zs->dict,c->argv[2]);
5666 if (de == NULL) {
5667 addReply(c,shared.czero);
5668 return;
5669 }
5670 /* Delete from the skiplist */
5671 oldscore = dictGetEntryVal(de);
5672 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5673 redisAssert(deleted != 0);
5674
5675 /* Delete from the hash table */
5676 dictDelete(zs->dict,c->argv[2]);
5677 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5678 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5679 server.dirty++;
5680 addReply(c,shared.cone);
5681 }
5682
5683 static void zremrangebyscoreCommand(redisClient *c) {
5684 double min;
5685 double max;
5686 long deleted;
5687 robj *zsetobj;
5688 zset *zs;
5689
5690 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5691 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5692
5693 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5694 checkType(c,zsetobj,REDIS_ZSET)) return;
5695
5696 zs = zsetobj->ptr;
5697 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5698 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5699 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5700 server.dirty += deleted;
5701 addReplyLong(c,deleted);
5702 }
5703
5704 static void zremrangebyrankCommand(redisClient *c) {
5705 long start;
5706 long end;
5707 int llen;
5708 long deleted;
5709 robj *zsetobj;
5710 zset *zs;
5711
5712 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5713 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5714
5715 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5716 checkType(c,zsetobj,REDIS_ZSET)) return;
5717 zs = zsetobj->ptr;
5718 llen = zs->zsl->length;
5719
5720 /* convert negative indexes */
5721 if (start < 0) start = llen+start;
5722 if (end < 0) end = llen+end;
5723 if (start < 0) start = 0;
5724 if (end < 0) end = 0;
5725
5726 /* indexes sanity checks */
5727 if (start > end || start >= llen) {
5728 addReply(c,shared.czero);
5729 return;
5730 }
5731 if (end >= llen) end = llen-1;
5732
5733 /* increment start and end because zsl*Rank functions
5734 * use 1-based rank */
5735 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5736 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5737 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5738 server.dirty += deleted;
5739 addReplyLong(c, deleted);
5740 }
5741
/* One input of a ZUNION/ZINTER operation, as collected by
 * zunionInterGenericCommand(). */
typedef struct {
    dict *dict;     /* dict of the source sorted set; NULL when the key does not exist */
    double weight;  /* factor every score read from this source is multiplied by (defaults to 1.0) */
} zsetopsrc;
5746
5747 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5748 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5749 unsigned long size1, size2;
5750 size1 = d1->dict ? dictSize(d1->dict) : 0;
5751 size2 = d2->dict ? dictSize(d2->dict) : 0;
5752 return size1 - size2;
5753 }
5754
/* Aggregation modes understood by zunionInterAggregate(). They are selected
 * through the AGGREGATE option of ZUNION/ZINTER; SUM is the default. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5758
5759 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5760 if (aggregate == REDIS_AGGR_SUM) {
5761 *target = *target + val;
5762 } else if (aggregate == REDIS_AGGR_MIN) {
5763 *target = val < *target ? val : *target;
5764 } else if (aggregate == REDIS_AGGR_MAX) {
5765 *target = val > *target ? val : *target;
5766 } else {
5767 /* safety net */
5768 redisPanic("Unknown ZUNION/INTER aggregate type");
5769 }
5770 }
5771
5772 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5773 int i, j, zsetnum;
5774 int aggregate = REDIS_AGGR_SUM;
5775 zsetopsrc *src;
5776 robj *dstobj;
5777 zset *dstzset;
5778 dictIterator *di;
5779 dictEntry *de;
5780
5781 /* expect zsetnum input keys to be given */
5782 zsetnum = atoi(c->argv[2]->ptr);
5783 if (zsetnum < 1) {
5784 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5785 return;
5786 }
5787
5788 /* test if the expected number of keys would overflow */
5789 if (3+zsetnum > c->argc) {
5790 addReply(c,shared.syntaxerr);
5791 return;
5792 }
5793
5794 /* read keys to be used for input */
5795 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5796 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5797 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5798 if (!zsetobj) {
5799 src[i].dict = NULL;
5800 } else {
5801 if (zsetobj->type != REDIS_ZSET) {
5802 zfree(src);
5803 addReply(c,shared.wrongtypeerr);
5804 return;
5805 }
5806 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5807 }
5808
5809 /* default all weights to 1 */
5810 src[i].weight = 1.0;
5811 }
5812
5813 /* parse optional extra arguments */
5814 if (j < c->argc) {
5815 int remaining = c->argc - j;
5816
5817 while (remaining) {
5818 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5819 j++; remaining--;
5820 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5821 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5822 return;
5823 }
5824 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5825 j++; remaining--;
5826 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5827 aggregate = REDIS_AGGR_SUM;
5828 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5829 aggregate = REDIS_AGGR_MIN;
5830 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5831 aggregate = REDIS_AGGR_MAX;
5832 } else {
5833 zfree(src);
5834 addReply(c,shared.syntaxerr);
5835 return;
5836 }
5837 j++; remaining--;
5838 } else {
5839 zfree(src);
5840 addReply(c,shared.syntaxerr);
5841 return;
5842 }
5843 }
5844 }
5845
5846 /* sort sets from the smallest to largest, this will improve our
5847 * algorithm's performance */
5848 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5849
5850 dstobj = createZsetObject();
5851 dstzset = dstobj->ptr;
5852
5853 if (op == REDIS_OP_INTER) {
5854 /* skip going over all entries if the smallest zset is NULL or empty */
5855 if (src[0].dict && dictSize(src[0].dict) > 0) {
5856 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5857 * from small to large, all src[i > 0].dict are non-empty too */
5858 di = dictGetIterator(src[0].dict);
5859 while((de = dictNext(di)) != NULL) {
5860 double *score = zmalloc(sizeof(double)), value;
5861 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5862
5863 for (j = 1; j < zsetnum; j++) {
5864 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5865 if (other) {
5866 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5867 zunionInterAggregate(score, value, aggregate);
5868 } else {
5869 break;
5870 }
5871 }
5872
5873 /* skip entry when not present in every source dict */
5874 if (j != zsetnum) {
5875 zfree(score);
5876 } else {
5877 robj *o = dictGetEntryKey(de);
5878 dictAdd(dstzset->dict,o,score);
5879 incrRefCount(o); /* added to dictionary */
5880 zslInsert(dstzset->zsl,*score,o);
5881 incrRefCount(o); /* added to skiplist */
5882 }
5883 }
5884 dictReleaseIterator(di);
5885 }
5886 } else if (op == REDIS_OP_UNION) {
5887 for (i = 0; i < zsetnum; i++) {
5888 if (!src[i].dict) continue;
5889
5890 di = dictGetIterator(src[i].dict);
5891 while((de = dictNext(di)) != NULL) {
5892 /* skip key when already processed */
5893 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5894
5895 double *score = zmalloc(sizeof(double)), value;
5896 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5897
5898 /* because the zsets are sorted by size, its only possible
5899 * for sets at larger indices to hold this entry */
5900 for (j = (i+1); j < zsetnum; j++) {
5901 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5902 if (other) {
5903 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5904 zunionInterAggregate(score, value, aggregate);
5905 }
5906 }
5907
5908 robj *o = dictGetEntryKey(de);
5909 dictAdd(dstzset->dict,o,score);
5910 incrRefCount(o); /* added to dictionary */
5911 zslInsert(dstzset->zsl,*score,o);
5912 incrRefCount(o); /* added to skiplist */
5913 }
5914 dictReleaseIterator(di);
5915 }
5916 } else {
5917 /* unknown operator */
5918 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5919 }
5920
5921 deleteKey(c->db,dstkey);
5922 if (dstzset->zsl->length) {
5923 dictAdd(c->db->dict,dstkey,dstobj);
5924 incrRefCount(dstkey);
5925 addReplyLong(c, dstzset->zsl->length);
5926 server.dirty++;
5927 } else {
5928 decrRefCount(dstobj);
5929 addReply(c, shared.czero);
5930 }
5931 zfree(src);
5932 }
5933
5934 static void zunionCommand(redisClient *c) {
5935 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5936 }
5937
5938 static void zinterCommand(redisClient *c) {
5939 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5940 }
5941
5942 static void zrangeGenericCommand(redisClient *c, int reverse) {
5943 robj *o;
5944 long start;
5945 long end;
5946 int withscores = 0;
5947 int llen;
5948 int rangelen, j;
5949 zset *zsetobj;
5950 zskiplist *zsl;
5951 zskiplistNode *ln;
5952 robj *ele;
5953
5954 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5955 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5956
5957 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5958 withscores = 1;
5959 } else if (c->argc >= 5) {
5960 addReply(c,shared.syntaxerr);
5961 return;
5962 }
5963
5964 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5965 || checkType(c,o,REDIS_ZSET)) return;
5966 zsetobj = o->ptr;
5967 zsl = zsetobj->zsl;
5968 llen = zsl->length;
5969
5970 /* convert negative indexes */
5971 if (start < 0) start = llen+start;
5972 if (end < 0) end = llen+end;
5973 if (start < 0) start = 0;
5974 if (end < 0) end = 0;
5975
5976 /* indexes sanity checks */
5977 if (start > end || start >= llen) {
5978 /* Out of range start or start > end result in empty list */
5979 addReply(c,shared.emptymultibulk);
5980 return;
5981 }
5982 if (end >= llen) end = llen-1;
5983 rangelen = (end-start)+1;
5984
5985 /* check if starting point is trivial, before searching
5986 * the element in log(N) time */
5987 if (reverse) {
5988 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5989 } else {
5990 ln = start == 0 ?
5991 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5992 }
5993
5994 /* Return the result in form of a multi-bulk reply */
5995 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5996 withscores ? (rangelen*2) : rangelen));
5997 for (j = 0; j < rangelen; j++) {
5998 ele = ln->obj;
5999 addReplyBulk(c,ele);
6000 if (withscores)
6001 addReplyDouble(c,ln->score);
6002 ln = reverse ? ln->backward : ln->forward[0];
6003 }
6004 }
6005
6006 static void zrangeCommand(redisClient *c) {
6007 zrangeGenericCommand(c,0);
6008 }
6009
6010 static void zrevrangeCommand(redisClient *c) {
6011 zrangeGenericCommand(c,1);
6012 }
6013
6014 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
6015 * If justcount is non-zero, just the count is returned. */
6016 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
6017 robj *o;
6018 double min, max;
6019 int minex = 0, maxex = 0; /* are min or max exclusive? */
6020 int offset = 0, limit = -1;
6021 int withscores = 0;
6022 int badsyntax = 0;
6023
6024 /* Parse the min-max interval. If one of the values is prefixed
6025 * by the "(" character, it's considered "open". For instance
6026 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6027 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6028 if (((char*)c->argv[2]->ptr)[0] == '(') {
6029 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6030 minex = 1;
6031 } else {
6032 min = strtod(c->argv[2]->ptr,NULL);
6033 }
6034 if (((char*)c->argv[3]->ptr)[0] == '(') {
6035 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6036 maxex = 1;
6037 } else {
6038 max = strtod(c->argv[3]->ptr,NULL);
6039 }
6040
6041 /* Parse "WITHSCORES": note that if the command was called with
6042 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6043 * enter the following paths to parse WITHSCORES and LIMIT. */
6044 if (c->argc == 5 || c->argc == 8) {
6045 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6046 withscores = 1;
6047 else
6048 badsyntax = 1;
6049 }
6050 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
6051 badsyntax = 1;
6052 if (badsyntax) {
6053 addReplySds(c,
6054 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
6055 return;
6056 }
6057
6058 /* Parse "LIMIT" */
6059 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
6060 addReply(c,shared.syntaxerr);
6061 return;
6062 } else if (c->argc == (7 + withscores)) {
6063 offset = atoi(c->argv[5]->ptr);
6064 limit = atoi(c->argv[6]->ptr);
6065 if (offset < 0) offset = 0;
6066 }
6067
6068 /* Ok, lookup the key and get the range */
6069 o = lookupKeyRead(c->db,c->argv[1]);
6070 if (o == NULL) {
6071 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6072 } else {
6073 if (o->type != REDIS_ZSET) {
6074 addReply(c,shared.wrongtypeerr);
6075 } else {
6076 zset *zsetobj = o->ptr;
6077 zskiplist *zsl = zsetobj->zsl;
6078 zskiplistNode *ln;
6079 robj *ele, *lenobj = NULL;
6080 unsigned long rangelen = 0;
6081
6082 /* Get the first node with the score >= min, or with
6083 * score > min if 'minex' is true. */
6084 ln = zslFirstWithScore(zsl,min);
6085 while (minex && ln && ln->score == min) ln = ln->forward[0];
6086
6087 if (ln == NULL) {
6088 /* No element matching the speciifed interval */
6089 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6090 return;
6091 }
6092
6093 /* We don't know in advance how many matching elements there
6094 * are in the list, so we push this object that will represent
6095 * the multi-bulk length in the output buffer, and will "fix"
6096 * it later */
6097 if (!justcount) {
6098 lenobj = createObject(REDIS_STRING,NULL);
6099 addReply(c,lenobj);
6100 decrRefCount(lenobj);
6101 }
6102
6103 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6104 if (offset) {
6105 offset--;
6106 ln = ln->forward[0];
6107 continue;
6108 }
6109 if (limit == 0) break;
6110 if (!justcount) {
6111 ele = ln->obj;
6112 addReplyBulk(c,ele);
6113 if (withscores)
6114 addReplyDouble(c,ln->score);
6115 }
6116 ln = ln->forward[0];
6117 rangelen++;
6118 if (limit > 0) limit--;
6119 }
6120 if (justcount) {
6121 addReplyLong(c,(long)rangelen);
6122 } else {
6123 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6124 withscores ? (rangelen*2) : rangelen);
6125 }
6126 }
6127 }
6128 }
6129
6130 static void zrangebyscoreCommand(redisClient *c) {
6131 genericZrangebyscoreCommand(c,0);
6132 }
6133
6134 static void zcountCommand(redisClient *c) {
6135 genericZrangebyscoreCommand(c,1);
6136 }
6137
6138 static void zcardCommand(redisClient *c) {
6139 robj *o;
6140 zset *zs;
6141
6142 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6143 checkType(c,o,REDIS_ZSET)) return;
6144
6145 zs = o->ptr;
6146 addReplyUlong(c,zs->zsl->length);
6147 }
6148
6149 static void zscoreCommand(redisClient *c) {
6150 robj *o;
6151 zset *zs;
6152 dictEntry *de;
6153
6154 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6155 checkType(c,o,REDIS_ZSET)) return;
6156
6157 zs = o->ptr;
6158 de = dictFind(zs->dict,c->argv[2]);
6159 if (!de) {
6160 addReply(c,shared.nullbulk);
6161 } else {
6162 double *score = dictGetEntryVal(de);
6163
6164 addReplyDouble(c,*score);
6165 }
6166 }
6167
6168 static void zrankGenericCommand(redisClient *c, int reverse) {
6169 robj *o;
6170 zset *zs;
6171 zskiplist *zsl;
6172 dictEntry *de;
6173 unsigned long rank;
6174 double *score;
6175
6176 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6177 checkType(c,o,REDIS_ZSET)) return;
6178
6179 zs = o->ptr;
6180 zsl = zs->zsl;
6181 de = dictFind(zs->dict,c->argv[2]);
6182 if (!de) {
6183 addReply(c,shared.nullbulk);
6184 return;
6185 }
6186
6187 score = dictGetEntryVal(de);
6188 rank = zslGetRank(zsl, *score, c->argv[2]);
6189 if (rank) {
6190 if (reverse) {
6191 addReplyLong(c, zsl->length - rank);
6192 } else {
6193 addReplyLong(c, rank-1);
6194 }
6195 } else {
6196 addReply(c,shared.nullbulk);
6197 }
6198 }
6199
6200 static void zrankCommand(redisClient *c) {
6201 zrankGenericCommand(c, 0);
6202 }
6203
6204 static void zrevrankCommand(redisClient *c) {
6205 zrankGenericCommand(c, 1);
6206 }
6207
/* ========================= Hashes utility functions ======================= */
/* Selector bits telling hashCurrent() and genericHgetallCommand() whether the
 * field, the value, or (when OR-ed together) both are wanted. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
6211
6212 /* Check the length of a number of objects to see if we need to convert a
6213 * zipmap to a real hash. Note that we only check string encoded objects
6214 * as their string length can be queried in constant time. */
6215 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6216 int i;
6217 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6218
6219 for (i = start; i <= end; i++) {
6220 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6221 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6222 {
6223 convertToRealHash(subject);
6224 return;
6225 }
6226 }
6227 }
6228
6229 /* Encode given objects in-place when the hash uses a dict. */
6230 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6231 if (subject->encoding == REDIS_ENCODING_HT) {
6232 if (o1) *o1 = tryObjectEncoding(*o1);
6233 if (o2) *o2 = tryObjectEncoding(*o2);
6234 }
6235 }
6236
6237 /* Get the value from a hash identified by key. Returns either a string
6238 * object or NULL if the value cannot be found. The refcount of the object
6239 * is always increased by 1 when the value was found. */
6240 static robj *hashGet(robj *o, robj *key) {
6241 robj *value = NULL;
6242 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6243 unsigned char *v;
6244 unsigned int vlen;
6245 key = getDecodedObject(key);
6246 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6247 value = createStringObject((char*)v,vlen);
6248 }
6249 decrRefCount(key);
6250 } else {
6251 dictEntry *de = dictFind(o->ptr,key);
6252 if (de != NULL) {
6253 value = dictGetEntryVal(de);
6254 incrRefCount(value);
6255 }
6256 }
6257 return value;
6258 }
6259
6260 /* Test if the key exists in the given hash. Returns 1 if the key
6261 * exists and 0 when it doesn't. */
6262 static int hashExists(robj *o, robj *key) {
6263 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6264 key = getDecodedObject(key);
6265 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6266 decrRefCount(key);
6267 return 1;
6268 }
6269 decrRefCount(key);
6270 } else {
6271 if (dictFind(o->ptr,key) != NULL) {
6272 return 1;
6273 }
6274 }
6275 return 0;
6276 }
6277
6278 /* Add an element, discard the old if the key already exists.
6279 * Return 0 on insert and 1 on update. */
6280 static int hashSet(robj *o, robj *key, robj *value) {
6281 int update = 0;
6282 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6283 key = getDecodedObject(key);
6284 value = getDecodedObject(value);
6285 o->ptr = zipmapSet(o->ptr,
6286 key->ptr,sdslen(key->ptr),
6287 value->ptr,sdslen(value->ptr), &update);
6288 decrRefCount(key);
6289 decrRefCount(value);
6290
6291 /* Check if the zipmap needs to be upgraded to a real hash table */
6292 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6293 convertToRealHash(o);
6294 } else {
6295 if (dictReplace(o->ptr,key,value)) {
6296 /* Insert */
6297 incrRefCount(key);
6298 } else {
6299 /* Update */
6300 update = 1;
6301 }
6302 incrRefCount(value);
6303 }
6304 return update;
6305 }
6306
6307 /* Delete an element from a hash.
6308 * Return 1 on deleted and 0 on not found. */
6309 static int hashDelete(robj *o, robj *key) {
6310 int deleted = 0;
6311 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6312 key = getDecodedObject(key);
6313 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6314 decrRefCount(key);
6315 } else {
6316 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6317 /* Always check if the dictionary needs a resize after a delete. */
6318 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6319 }
6320 return deleted;
6321 }
6322
6323 /* Return the number of elements in a hash. */
6324 static unsigned long hashLength(robj *o) {
6325 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6326 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6327 }
6328
/* Structure to hold the hash iteration abstraction. Note that iteration over
 * hashes involves both fields and values. Because it is possible that
 * not both are required, store pointers in the iterator to avoid
 * unnecessary memory allocation for fields/values. */
typedef struct {
    int encoding;               /* encoding of the hash, copied when the iterator is created */
    unsigned char *zi;          /* zipmap cursor: set by zipmapRewind(), advanced by zipmapNext() */
    unsigned char *zk, *zv;     /* current zipmap field / value, filled in by zipmapNext() */
    unsigned int zklen, zvlen;  /* lengths of zk and zv */

    dictIterator *di;           /* dict iterator, used when the encoding is REDIS_ENCODING_HT */
    dictEntry *de;              /* current dict entry, as returned by dictNext() */
} hashIterator;
6342
6343 static hashIterator *hashInitIterator(robj *subject) {
6344 hashIterator *hi = zmalloc(sizeof(hashIterator));
6345 hi->encoding = subject->encoding;
6346 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6347 hi->zi = zipmapRewind(subject->ptr);
6348 } else if (hi->encoding == REDIS_ENCODING_HT) {
6349 hi->di = dictGetIterator(subject->ptr);
6350 } else {
6351 redisAssert(NULL);
6352 }
6353 return hi;
6354 }
6355
6356 static void hashReleaseIterator(hashIterator *hi) {
6357 if (hi->encoding == REDIS_ENCODING_HT) {
6358 dictReleaseIterator(hi->di);
6359 }
6360 zfree(hi);
6361 }
6362
6363 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6364 * could be found and REDIS_ERR when the iterator reaches the end. */
6365 static int hashNext(hashIterator *hi) {
6366 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6367 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6368 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6369 } else {
6370 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6371 }
6372 return REDIS_OK;
6373 }
6374
6375 /* Get key or value object at current iteration position.
6376 * This increases the refcount of the field object by 1. */
6377 static robj *hashCurrent(hashIterator *hi, int what) {
6378 robj *o;
6379 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6380 if (what & REDIS_HASH_KEY) {
6381 o = createStringObject((char*)hi->zk,hi->zklen);
6382 } else {
6383 o = createStringObject((char*)hi->zv,hi->zvlen);
6384 }
6385 } else {
6386 if (what & REDIS_HASH_KEY) {
6387 o = dictGetEntryKey(hi->de);
6388 } else {
6389 o = dictGetEntryVal(hi->de);
6390 }
6391 incrRefCount(o);
6392 }
6393 return o;
6394 }
6395
6396 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6397 robj *o = lookupKeyWrite(c->db,key);
6398 if (o == NULL) {
6399 o = createHashObject();
6400 dictAdd(c->db->dict,key,o);
6401 incrRefCount(key);
6402 } else {
6403 if (o->type != REDIS_HASH) {
6404 addReply(c,shared.wrongtypeerr);
6405 return NULL;
6406 }
6407 }
6408 return o;
6409 }
6410
6411 /* ============================= Hash commands ============================== */
6412 static void hsetCommand(redisClient *c) {
6413 int update;
6414 robj *o;
6415
6416 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6417 hashTryConversion(o,c->argv,2,3);
6418 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6419 update = hashSet(o,c->argv[2],c->argv[3]);
6420 addReply(c, update ? shared.czero : shared.cone);
6421 server.dirty++;
6422 }
6423
6424 static void hsetnxCommand(redisClient *c) {
6425 robj *o;
6426 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6427 hashTryConversion(o,c->argv,2,3);
6428
6429 if (hashExists(o, c->argv[2])) {
6430 addReply(c, shared.czero);
6431 } else {
6432 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6433 hashSet(o,c->argv[2],c->argv[3]);
6434 addReply(c, shared.cone);
6435 server.dirty++;
6436 }
6437 }
6438
6439 static void hmsetCommand(redisClient *c) {
6440 int i;
6441 robj *o;
6442
6443 if ((c->argc % 2) == 1) {
6444 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6445 return;
6446 }
6447
6448 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6449 hashTryConversion(o,c->argv,2,c->argc-1);
6450 for (i = 2; i < c->argc; i += 2) {
6451 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6452 hashSet(o,c->argv[i],c->argv[i+1]);
6453 }
6454 addReply(c, shared.ok);
6455 server.dirty++;
6456 }
6457
6458 static void hincrbyCommand(redisClient *c) {
6459 long long value, incr;
6460 robj *o, *current, *new;
6461
6462 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6463 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6464 if ((current = hashGet(o,c->argv[2])) != NULL) {
6465 if (getLongLongFromObjectOrReply(c,current,&value,
6466 "hash value is not an integer") != REDIS_OK) {
6467 decrRefCount(current);
6468 return;
6469 }
6470 decrRefCount(current);
6471 } else {
6472 value = 0;
6473 }
6474
6475 value += incr;
6476 new = createStringObjectFromLongLong(value);
6477 hashTryObjectEncoding(o,&c->argv[2],NULL);
6478 hashSet(o,c->argv[2],new);
6479 decrRefCount(new);
6480 addReplyLongLong(c,value);
6481 server.dirty++;
6482 }
6483
6484 static void hgetCommand(redisClient *c) {
6485 robj *o, *value;
6486 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6487 checkType(c,o,REDIS_HASH)) return;
6488
6489 if ((value = hashGet(o,c->argv[2])) != NULL) {
6490 addReplyBulk(c,value);
6491 decrRefCount(value);
6492 } else {
6493 addReply(c,shared.nullbulk);
6494 }
6495 }
6496
6497 static void hmgetCommand(redisClient *c) {
6498 int i;
6499 robj *o, *value;
6500 o = lookupKeyRead(c->db,c->argv[1]);
6501 if (o != NULL && o->type != REDIS_HASH) {
6502 addReply(c,shared.wrongtypeerr);
6503 }
6504
6505 /* Note the check for o != NULL happens inside the loop. This is
6506 * done because objects that cannot be found are considered to be
6507 * an empty hash. The reply should then be a series of NULLs. */
6508 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6509 for (i = 2; i < c->argc; i++) {
6510 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6511 addReplyBulk(c,value);
6512 decrRefCount(value);
6513 } else {
6514 addReply(c,shared.nullbulk);
6515 }
6516 }
6517 }
6518
6519 static void hdelCommand(redisClient *c) {
6520 robj *o;
6521 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6522 checkType(c,o,REDIS_HASH)) return;
6523
6524 if (hashDelete(o,c->argv[2])) {
6525 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6526 addReply(c,shared.cone);
6527 server.dirty++;
6528 } else {
6529 addReply(c,shared.czero);
6530 }
6531 }
6532
6533 static void hlenCommand(redisClient *c) {
6534 robj *o;
6535 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6536 checkType(c,o,REDIS_HASH)) return;
6537
6538 addReplyUlong(c,hashLength(o));
6539 }
6540
6541 static void genericHgetallCommand(redisClient *c, int flags) {
6542 robj *o, *lenobj, *obj;
6543 unsigned long count = 0;
6544 hashIterator *hi;
6545
6546 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6547 || checkType(c,o,REDIS_HASH)) return;
6548
6549 lenobj = createObject(REDIS_STRING,NULL);
6550 addReply(c,lenobj);
6551 decrRefCount(lenobj);
6552
6553 hi = hashInitIterator(o);
6554 while (hashNext(hi) != REDIS_ERR) {
6555 if (flags & REDIS_HASH_KEY) {
6556 obj = hashCurrent(hi,REDIS_HASH_KEY);
6557 addReplyBulk(c,obj);
6558 decrRefCount(obj);
6559 count++;
6560 }
6561 if (flags & REDIS_HASH_VALUE) {
6562 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6563 addReplyBulk(c,obj);
6564 decrRefCount(obj);
6565 count++;
6566 }
6567 }
6568 hashReleaseIterator(hi);
6569
6570 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6571 }
6572
6573 static void hkeysCommand(redisClient *c) {
6574 genericHgetallCommand(c,REDIS_HASH_KEY);
6575 }
6576
6577 static void hvalsCommand(redisClient *c) {
6578 genericHgetallCommand(c,REDIS_HASH_VALUE);
6579 }
6580
6581 static void hgetallCommand(redisClient *c) {
6582 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6583 }
6584
6585 static void hexistsCommand(redisClient *c) {
6586 robj *o;
6587 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6588 checkType(c,o,REDIS_HASH)) return;
6589
6590 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6591 }
6592
6593 static void convertToRealHash(robj *o) {
6594 unsigned char *key, *val, *p, *zm = o->ptr;
6595 unsigned int klen, vlen;
6596 dict *dict = dictCreate(&hashDictType,NULL);
6597
6598 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6599 p = zipmapRewind(zm);
6600 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6601 robj *keyobj, *valobj;
6602
6603 keyobj = createStringObject((char*)key,klen);
6604 valobj = createStringObject((char*)val,vlen);
6605 keyobj = tryObjectEncoding(keyobj);
6606 valobj = tryObjectEncoding(valobj);
6607 dictAdd(dict,keyobj,valobj);
6608 }
6609 o->encoding = REDIS_ENCODING_HT;
6610 o->ptr = dict;
6611 zfree(zm);
6612 }
6613
6614 /* ========================= Non type-specific commands ==================== */
6615
6616 static void flushdbCommand(redisClient *c) {
6617 server.dirty += dictSize(c->db->dict);
6618 dictEmpty(c->db->dict);
6619 dictEmpty(c->db->expires);
6620 addReply(c,shared.ok);
6621 }
6622
6623 static void flushallCommand(redisClient *c) {
6624 server.dirty += emptyDb();
6625 addReply(c,shared.ok);
6626 if (server.bgsavechildpid != -1) {
6627 kill(server.bgsavechildpid,SIGKILL);
6628 rdbRemoveTempFile(server.bgsavechildpid);
6629 }
6630 rdbSave(server.dbfilename);
6631 server.dirty++;
6632 }
6633
6634 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6635 redisSortOperation *so = zmalloc(sizeof(*so));
6636 so->type = type;
6637 so->pattern = pattern;
6638 return so;
6639 }
6640
6641 /* Return the value associated to the key with a name obtained
6642 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6643 * The returned object will always have its refcount increased by 1
6644 * when it is non-NULL. */
6645 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6646 char *p, *f;
6647 sds spat, ssub;
6648 robj keyobj, fieldobj, *o;
6649 int prefixlen, sublen, postfixlen, fieldlen;
6650 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6651 struct {
6652 long len;
6653 long free;
6654 char buf[REDIS_SORTKEY_MAX+1];
6655 } keyname, fieldname;
6656
6657 /* If the pattern is "#" return the substitution object itself in order
6658 * to implement the "SORT ... GET #" feature. */
6659 spat = pattern->ptr;
6660 if (spat[0] == '#' && spat[1] == '\0') {
6661 incrRefCount(subst);
6662 return subst;
6663 }
6664
6665 /* The substitution object may be specially encoded. If so we create
6666 * a decoded object on the fly. Otherwise getDecodedObject will just
6667 * increment the ref count, that we'll decrement later. */
6668 subst = getDecodedObject(subst);
6669
6670 ssub = subst->ptr;
6671 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6672 p = strchr(spat,'*');
6673 if (!p) {
6674 decrRefCount(subst);
6675 return NULL;
6676 }
6677
6678 /* Find out if we're dealing with a hash dereference. */
6679 if ((f = strstr(p+1, "->")) != NULL) {
6680 fieldlen = sdslen(spat)-(f-spat);
6681 /* this also copies \0 character */
6682 memcpy(fieldname.buf,f+2,fieldlen-1);
6683 fieldname.len = fieldlen-2;
6684 } else {
6685 fieldlen = 0;
6686 }
6687
6688 prefixlen = p-spat;
6689 sublen = sdslen(ssub);
6690 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6691 memcpy(keyname.buf,spat,prefixlen);
6692 memcpy(keyname.buf+prefixlen,ssub,sublen);
6693 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6694 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6695 keyname.len = prefixlen+sublen+postfixlen;
6696 decrRefCount(subst);
6697
6698 /* Lookup substituted key */
6699 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6700 o = lookupKeyRead(db,&keyobj);
6701 if (o == NULL) return NULL;
6702
6703 if (fieldlen > 0) {
6704 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6705
6706 /* Retrieve value from hash by the field name. This operation
6707 * already increases the refcount of the returned object. */
6708 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6709 o = hashGet(o, &fieldobj);
6710 } else {
6711 if (o->type != REDIS_STRING) return NULL;
6712
6713 /* Every object that this function returns needs to have its refcount
6714 * increased. sortCommand decreases it again. */
6715 incrRefCount(o);
6716 }
6717
6718 return o;
6719 }
6720
6721 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6722 * the additional parameter is not standard but a BSD-specific we have to
6723 * pass sorting parameters via the global 'server' structure */
6724 static int sortCompare(const void *s1, const void *s2) {
6725 const redisSortObject *so1 = s1, *so2 = s2;
6726 int cmp;
6727
6728 if (!server.sort_alpha) {
6729 /* Numeric sorting. Here it's trivial as we precomputed scores */
6730 if (so1->u.score > so2->u.score) {
6731 cmp = 1;
6732 } else if (so1->u.score < so2->u.score) {
6733 cmp = -1;
6734 } else {
6735 cmp = 0;
6736 }
6737 } else {
6738 /* Alphanumeric sorting */
6739 if (server.sort_bypattern) {
6740 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6741 /* At least one compare object is NULL */
6742 if (so1->u.cmpobj == so2->u.cmpobj)
6743 cmp = 0;
6744 else if (so1->u.cmpobj == NULL)
6745 cmp = -1;
6746 else
6747 cmp = 1;
6748 } else {
6749 /* We have both the objects, use strcoll */
6750 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6751 }
6752 } else {
6753 /* Compare elements directly. */
6754 cmp = compareStringObjects(so1->obj,so2->obj);
6755 }
6756 }
6757 return server.sort_desc ? -cmp : cmp;
6758 }
6759
6760 /* The SORT command is the most complex command in Redis. Warning: this code
6761 * is optimized for speed and a bit less for readability */
6762 static void sortCommand(redisClient *c) {
6763 list *operations;
6764 int outputlen = 0;
6765 int desc = 0, alpha = 0;
6766 int limit_start = 0, limit_count = -1, start, end;
6767 int j, dontsort = 0, vectorlen;
6768 int getop = 0; /* GET operation counter */
6769 robj *sortval, *sortby = NULL, *storekey = NULL;
6770 redisSortObject *vector; /* Resulting vector to sort */
6771
6772 /* Lookup the key to sort. It must be of the right types */
6773 sortval = lookupKeyRead(c->db,c->argv[1]);
6774 if (sortval == NULL) {
6775 addReply(c,shared.emptymultibulk);
6776 return;
6777 }
6778 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6779 sortval->type != REDIS_ZSET)
6780 {
6781 addReply(c,shared.wrongtypeerr);
6782 return;
6783 }
6784
6785 /* Create a list of operations to perform for every sorted element.
6786 * Operations can be GET/DEL/INCR/DECR */
6787 operations = listCreate();
6788 listSetFreeMethod(operations,zfree);
6789 j = 2;
6790
6791 /* Now we need to protect sortval incrementing its count, in the future
6792 * SORT may have options able to overwrite/delete keys during the sorting
6793 * and the sorted key itself may get destroied */
6794 incrRefCount(sortval);
6795
6796 /* The SORT command has an SQL-alike syntax, parse it */
6797 while(j < c->argc) {
6798 int leftargs = c->argc-j-1;
6799 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6800 desc = 0;
6801 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6802 desc = 1;
6803 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6804 alpha = 1;
6805 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6806 limit_start = atoi(c->argv[j+1]->ptr);
6807 limit_count = atoi(c->argv[j+2]->ptr);
6808 j+=2;
6809 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6810 storekey = c->argv[j+1];
6811 j++;
6812 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6813 sortby = c->argv[j+1];
6814 /* If the BY pattern does not contain '*', i.e. it is constant,
6815 * we don't need to sort nor to lookup the weight keys. */
6816 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6817 j++;
6818 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6819 listAddNodeTail(operations,createSortOperation(
6820 REDIS_SORT_GET,c->argv[j+1]));
6821 getop++;
6822 j++;
6823 } else {
6824 decrRefCount(sortval);
6825 listRelease(operations);
6826 addReply(c,shared.syntaxerr);
6827 return;
6828 }
6829 j++;
6830 }
6831
6832 /* Load the sorting vector with all the objects to sort */
6833 switch(sortval->type) {
6834 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6835 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6836 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6837 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6838 }
6839 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6840 j = 0;
6841
6842 if (sortval->type == REDIS_LIST) {
6843 list *list = sortval->ptr;
6844 listNode *ln;
6845 listIter li;
6846
6847 listRewind(list,&li);
6848 while((ln = listNext(&li))) {
6849 robj *ele = ln->value;
6850 vector[j].obj = ele;
6851 vector[j].u.score = 0;
6852 vector[j].u.cmpobj = NULL;
6853 j++;
6854 }
6855 } else {
6856 dict *set;
6857 dictIterator *di;
6858 dictEntry *setele;
6859
6860 if (sortval->type == REDIS_SET) {
6861 set = sortval->ptr;
6862 } else {
6863 zset *zs = sortval->ptr;
6864 set = zs->dict;
6865 }
6866
6867 di = dictGetIterator(set);
6868 while((setele = dictNext(di)) != NULL) {
6869 vector[j].obj = dictGetEntryKey(setele);
6870 vector[j].u.score = 0;
6871 vector[j].u.cmpobj = NULL;
6872 j++;
6873 }
6874 dictReleaseIterator(di);
6875 }
6876 redisAssert(j == vectorlen);
6877
6878 /* Now it's time to load the right scores in the sorting vector */
6879 if (dontsort == 0) {
6880 for (j = 0; j < vectorlen; j++) {
6881 robj *byval;
6882 if (sortby) {
6883 /* lookup value to sort by */
6884 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6885 if (!byval) continue;
6886 } else {
6887 /* use object itself to sort by */
6888 byval = vector[j].obj;
6889 }
6890
6891 if (alpha) {
6892 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6893 } else {
6894 if (byval->encoding == REDIS_ENCODING_RAW) {
6895 vector[j].u.score = strtod(byval->ptr,NULL);
6896 } else if (byval->encoding == REDIS_ENCODING_INT) {
6897 /* Don't need to decode the object if it's
6898 * integer-encoded (the only encoding supported) so
6899 * far. We can just cast it */
6900 vector[j].u.score = (long)byval->ptr;
6901 } else {
6902 redisAssert(1 != 1);
6903 }
6904 }
6905
6906 /* when the object was retrieved using lookupKeyByPattern,
6907 * its refcount needs to be decreased. */
6908 if (sortby) {
6909 decrRefCount(byval);
6910 }
6911 }
6912 }
6913
6914 /* We are ready to sort the vector... perform a bit of sanity check
6915 * on the LIMIT option too. We'll use a partial version of quicksort. */
6916 start = (limit_start < 0) ? 0 : limit_start;
6917 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6918 if (start >= vectorlen) {
6919 start = vectorlen-1;
6920 end = vectorlen-2;
6921 }
6922 if (end >= vectorlen) end = vectorlen-1;
6923
6924 if (dontsort == 0) {
6925 server.sort_desc = desc;
6926 server.sort_alpha = alpha;
6927 server.sort_bypattern = sortby ? 1 : 0;
6928 if (sortby && (start != 0 || end != vectorlen-1))
6929 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6930 else
6931 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6932 }
6933
6934 /* Send command output to the output buffer, performing the specified
6935 * GET/DEL/INCR/DECR operations if any. */
6936 outputlen = getop ? getop*(end-start+1) : end-start+1;
6937 if (storekey == NULL) {
6938 /* STORE option not specified, sent the sorting result to client */
6939 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6940 for (j = start; j <= end; j++) {
6941 listNode *ln;
6942 listIter li;
6943
6944 if (!getop) addReplyBulk(c,vector[j].obj);
6945 listRewind(operations,&li);
6946 while((ln = listNext(&li))) {
6947 redisSortOperation *sop = ln->value;
6948 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6949 vector[j].obj);
6950
6951 if (sop->type == REDIS_SORT_GET) {
6952 if (!val) {
6953 addReply(c,shared.nullbulk);
6954 } else {
6955 addReplyBulk(c,val);
6956 decrRefCount(val);
6957 }
6958 } else {
6959 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6960 }
6961 }
6962 }
6963 } else {
6964 robj *listObject = createListObject();
6965 list *listPtr = (list*) listObject->ptr;
6966
6967 /* STORE option specified, set the sorting result as a List object */
6968 for (j = start; j <= end; j++) {
6969 listNode *ln;
6970 listIter li;
6971
6972 if (!getop) {
6973 listAddNodeTail(listPtr,vector[j].obj);
6974 incrRefCount(vector[j].obj);
6975 }
6976 listRewind(operations,&li);
6977 while((ln = listNext(&li))) {
6978 redisSortOperation *sop = ln->value;
6979 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6980 vector[j].obj);
6981
6982 if (sop->type == REDIS_SORT_GET) {
6983 if (!val) {
6984 listAddNodeTail(listPtr,createStringObject("",0));
6985 } else {
6986 /* We should do a incrRefCount on val because it is
6987 * added to the list, but also a decrRefCount because
6988 * it is returned by lookupKeyByPattern. This results
6989 * in doing nothing at all. */
6990 listAddNodeTail(listPtr,val);
6991 }
6992 } else {
6993 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6994 }
6995 }
6996 }
6997 if (dictReplace(c->db->dict,storekey,listObject)) {
6998 incrRefCount(storekey);
6999 }
7000 /* Note: we add 1 because the DB is dirty anyway since even if the
7001 * SORT result is empty a new key is set and maybe the old content
7002 * replaced. */
7003 server.dirty += 1+outputlen;
7004 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
7005 }
7006
7007 /* Cleanup */
7008 decrRefCount(sortval);
7009 listRelease(operations);
7010 for (j = 0; j < vectorlen; j++) {
7011 if (alpha && vector[j].u.cmpobj)
7012 decrRefCount(vector[j].u.cmpobj);
7013 }
7014 zfree(vector);
7015 }
7016
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer and 'n' the number of bytes. Values below 1024
 * are printed as an integer followed by 'B'; larger values are divided by
 * the matching power of 1024 and printed with two decimals followed by
 * K, M, G, T or P. Anything from 1024 pebibytes up falls back to the plain
 * byte count, so 's' is always written. The longest possible output is
 * 21 characters plus the terminator. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;
    double d;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        d = (double)n/kib;
        sprintf(s,"%.2fK",d);
    } else if (n < gib) {
        d = (double)n/mib;
        sprintf(s,"%.2fM",d);
    } else if (n < tib) {
        d = (double)n/gib;
        sprintf(s,"%.2fG",d);
    } else if (n < pib) {
        d = (double)n/tib;
        sprintf(s,"%.2fT",d);
    } else if (n < pib*1024) {
        d = (double)n/pib;
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the units above: report the raw byte count
         * instead of leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
7037
7038 /* Create the string returned by the INFO command. This is decoupled
7039 * by the INFO command itself as we need to report the same information
7040 * on memory corruption problems. */
7041 static sds genRedisInfoString(void) {
7042 sds info;
7043 time_t uptime = time(NULL)-server.stat_starttime;
7044 int j;
7045 char hmem[64];
7046
7047 bytesToHuman(hmem,zmalloc_used_memory());
7048 info = sdscatprintf(sdsempty(),
7049 "redis_version:%s\r\n"
7050 "arch_bits:%s\r\n"
7051 "multiplexing_api:%s\r\n"
7052 "process_id:%ld\r\n"
7053 "uptime_in_seconds:%ld\r\n"
7054 "uptime_in_days:%ld\r\n"
7055 "connected_clients:%d\r\n"
7056 "connected_slaves:%d\r\n"
7057 "blocked_clients:%d\r\n"
7058 "used_memory:%zu\r\n"
7059 "used_memory_human:%s\r\n"
7060 "changes_since_last_save:%lld\r\n"
7061 "bgsave_in_progress:%d\r\n"
7062 "last_save_time:%ld\r\n"
7063 "bgrewriteaof_in_progress:%d\r\n"
7064 "total_connections_received:%lld\r\n"
7065 "total_commands_processed:%lld\r\n"
7066 "expired_keys:%lld\r\n"
7067 "hash_max_zipmap_entries:%ld\r\n"
7068 "hash_max_zipmap_value:%ld\r\n"
7069 "pubsub_channels:%ld\r\n"
7070 "pubsub_patterns:%u\r\n"
7071 "vm_enabled:%d\r\n"
7072 "role:%s\r\n"
7073 ,REDIS_VERSION,
7074 (sizeof(long) == 8) ? "64" : "32",
7075 aeGetApiName(),
7076 (long) getpid(),
7077 uptime,
7078 uptime/(3600*24),
7079 listLength(server.clients)-listLength(server.slaves),
7080 listLength(server.slaves),
7081 server.blpop_blocked_clients,
7082 zmalloc_used_memory(),
7083 hmem,
7084 server.dirty,
7085 server.bgsavechildpid != -1,
7086 server.lastsave,
7087 server.bgrewritechildpid != -1,
7088 server.stat_numconnections,
7089 server.stat_numcommands,
7090 server.stat_expiredkeys,
7091 server.hash_max_zipmap_entries,
7092 server.hash_max_zipmap_value,
7093 dictSize(server.pubsub_channels),
7094 listLength(server.pubsub_patterns),
7095 server.vm_enabled != 0,
7096 server.masterhost == NULL ? "master" : "slave"
7097 );
7098 if (server.masterhost) {
7099 info = sdscatprintf(info,
7100 "master_host:%s\r\n"
7101 "master_port:%d\r\n"
7102 "master_link_status:%s\r\n"
7103 "master_last_io_seconds_ago:%d\r\n"
7104 ,server.masterhost,
7105 server.masterport,
7106 (server.replstate == REDIS_REPL_CONNECTED) ?
7107 "up" : "down",
7108 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7109 );
7110 }
7111 if (server.vm_enabled) {
7112 lockThreadedIO();
7113 info = sdscatprintf(info,
7114 "vm_conf_max_memory:%llu\r\n"
7115 "vm_conf_page_size:%llu\r\n"
7116 "vm_conf_pages:%llu\r\n"
7117 "vm_stats_used_pages:%llu\r\n"
7118 "vm_stats_swapped_objects:%llu\r\n"
7119 "vm_stats_swappin_count:%llu\r\n"
7120 "vm_stats_swappout_count:%llu\r\n"
7121 "vm_stats_io_newjobs_len:%lu\r\n"
7122 "vm_stats_io_processing_len:%lu\r\n"
7123 "vm_stats_io_processed_len:%lu\r\n"
7124 "vm_stats_io_active_threads:%lu\r\n"
7125 "vm_stats_blocked_clients:%lu\r\n"
7126 ,(unsigned long long) server.vm_max_memory,
7127 (unsigned long long) server.vm_page_size,
7128 (unsigned long long) server.vm_pages,
7129 (unsigned long long) server.vm_stats_used_pages,
7130 (unsigned long long) server.vm_stats_swapped_objects,
7131 (unsigned long long) server.vm_stats_swapins,
7132 (unsigned long long) server.vm_stats_swapouts,
7133 (unsigned long) listLength(server.io_newjobs),
7134 (unsigned long) listLength(server.io_processing),
7135 (unsigned long) listLength(server.io_processed),
7136 (unsigned long) server.io_active_threads,
7137 (unsigned long) server.vm_blocked_clients
7138 );
7139 unlockThreadedIO();
7140 }
7141 for (j = 0; j < server.dbnum; j++) {
7142 long long keys, vkeys;
7143
7144 keys = dictSize(server.db[j].dict);
7145 vkeys = dictSize(server.db[j].expires);
7146 if (keys || vkeys) {
7147 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7148 j, keys, vkeys);
7149 }
7150 }
7151 return info;
7152 }
7153
7154 static void infoCommand(redisClient *c) {
7155 sds info = genRedisInfoString();
7156 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7157 (unsigned long)sdslen(info)));
7158 addReplySds(c,info);
7159 addReply(c,shared.crlf);
7160 }
7161
7162 static void monitorCommand(redisClient *c) {
7163 /* ignore MONITOR if aleady slave or in monitor mode */
7164 if (c->flags & REDIS_SLAVE) return;
7165
7166 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7167 c->slaveseldb = 0;
7168 listAddNodeTail(server.monitors,c);
7169 addReply(c,shared.ok);
7170 }
7171
7172 /* ================================= Expire ================================= */
7173 static int removeExpire(redisDb *db, robj *key) {
7174 if (dictDelete(db->expires,key) == DICT_OK) {
7175 return 1;
7176 } else {
7177 return 0;
7178 }
7179 }
7180
7181 static int setExpire(redisDb *db, robj *key, time_t when) {
7182 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7183 return 0;
7184 } else {
7185 incrRefCount(key);
7186 return 1;
7187 }
7188 }
7189
7190 /* Return the expire time of the specified key, or -1 if no expire
7191 * is associated with this key (i.e. the key is non volatile) */
7192 static time_t getExpire(redisDb *db, robj *key) {
7193 dictEntry *de;
7194
7195 /* No expire? return ASAP */
7196 if (dictSize(db->expires) == 0 ||
7197 (de = dictFind(db->expires,key)) == NULL) return -1;
7198
7199 return (time_t) dictGetEntryVal(de);
7200 }
7201
7202 static int expireIfNeeded(redisDb *db, robj *key) {
7203 time_t when;
7204 dictEntry *de;
7205
7206 /* No expire? return ASAP */
7207 if (dictSize(db->expires) == 0 ||
7208 (de = dictFind(db->expires,key)) == NULL) return 0;
7209
7210 /* Lookup the expire */
7211 when = (time_t) dictGetEntryVal(de);
7212 if (time(NULL) <= when) return 0;
7213
7214 /* Delete the key */
7215 dictDelete(db->expires,key);
7216 server.stat_expiredkeys++;
7217 return dictDelete(db->dict,key) == DICT_OK;
7218 }
7219
7220 static int deleteIfVolatile(redisDb *db, robj *key) {
7221 dictEntry *de;
7222
7223 /* No expire? return ASAP */
7224 if (dictSize(db->expires) == 0 ||
7225 (de = dictFind(db->expires,key)) == NULL) return 0;
7226
7227 /* Delete the key */
7228 server.dirty++;
7229 server.stat_expiredkeys++;
7230 dictDelete(db->expires,key);
7231 return dictDelete(db->dict,key) == DICT_OK;
7232 }
7233
7234 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7235 dictEntry *de;
7236 time_t seconds;
7237
7238 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7239
7240 seconds -= offset;
7241
7242 de = dictFind(c->db->dict,key);
7243 if (de == NULL) {
7244 addReply(c,shared.czero);
7245 return;
7246 }
7247 if (seconds <= 0) {
7248 if (deleteKey(c->db,key)) server.dirty++;
7249 addReply(c, shared.cone);
7250 return;
7251 } else {
7252 time_t when = time(NULL)+seconds;
7253 if (setExpire(c->db,key,when)) {
7254 addReply(c,shared.cone);
7255 server.dirty++;
7256 } else {
7257 addReply(c,shared.czero);
7258 }
7259 return;
7260 }
7261 }
7262
7263 static void expireCommand(redisClient *c) {
7264 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7265 }
7266
7267 static void expireatCommand(redisClient *c) {
7268 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7269 }
7270
7271 static void ttlCommand(redisClient *c) {
7272 time_t expire;
7273 int ttl = -1;
7274
7275 expire = getExpire(c->db,c->argv[1]);
7276 if (expire != -1) {
7277 ttl = (int) (expire-time(NULL));
7278 if (ttl < 0) ttl = -1;
7279 }
7280 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7281 }
7282
7283 /* ================================ MULTI/EXEC ============================== */
7284
7285 /* Client state initialization for MULTI/EXEC */
7286 static void initClientMultiState(redisClient *c) {
7287 c->mstate.commands = NULL;
7288 c->mstate.count = 0;
7289 }
7290
7291 /* Release all the resources associated with MULTI/EXEC state */
7292 static void freeClientMultiState(redisClient *c) {
7293 int j;
7294
7295 for (j = 0; j < c->mstate.count; j++) {
7296 int i;
7297 multiCmd *mc = c->mstate.commands+j;
7298
7299 for (i = 0; i < mc->argc; i++)
7300 decrRefCount(mc->argv[i]);
7301 zfree(mc->argv);
7302 }
7303 zfree(c->mstate.commands);
7304 }
7305
7306 /* Add a new command into the MULTI commands queue */
7307 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7308 multiCmd *mc;
7309 int j;
7310
7311 c->mstate.commands = zrealloc(c->mstate.commands,
7312 sizeof(multiCmd)*(c->mstate.count+1));
7313 mc = c->mstate.commands+c->mstate.count;
7314 mc->cmd = cmd;
7315 mc->argc = c->argc;
7316 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7317 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7318 for (j = 0; j < c->argc; j++)
7319 incrRefCount(mc->argv[j]);
7320 c->mstate.count++;
7321 }
7322
7323 static void multiCommand(redisClient *c) {
7324 c->flags |= REDIS_MULTI;
7325 addReply(c,shared.ok);
7326 }
7327
7328 static void discardCommand(redisClient *c) {
7329 if (!(c->flags & REDIS_MULTI)) {
7330 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7331 return;
7332 }
7333
7334 freeClientMultiState(c);
7335 initClientMultiState(c);
7336 c->flags &= (~REDIS_MULTI);
7337 addReply(c,shared.ok);
7338 }
7339
7340 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7341 * implememntation for more information. */
7342 static void execCommandReplicateMulti(redisClient *c) {
7343 struct redisCommand *cmd;
7344 robj *multistring = createStringObject("MULTI",5);
7345
7346 cmd = lookupCommand("multi");
7347 if (server.appendonly)
7348 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7349 if (listLength(server.slaves))
7350 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7351 decrRefCount(multistring);
7352 }
7353
7354 static void execCommand(redisClient *c) {
7355 int j;
7356 robj **orig_argv;
7357 int orig_argc;
7358
7359 if (!(c->flags & REDIS_MULTI)) {
7360 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7361 return;
7362 }
7363
7364 /* Replicate a MULTI request now that we are sure the block is executed.
7365 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7366 * both the AOF and the replication link will have the same consistency
7367 * and atomicity guarantees. */
7368 execCommandReplicateMulti(c);
7369
7370 /* Exec all the queued commands */
7371 orig_argv = c->argv;
7372 orig_argc = c->argc;
7373 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7374 for (j = 0; j < c->mstate.count; j++) {
7375 c->argc = c->mstate.commands[j].argc;
7376 c->argv = c->mstate.commands[j].argv;
7377 call(c,c->mstate.commands[j].cmd);
7378 }
7379 c->argv = orig_argv;
7380 c->argc = orig_argc;
7381 freeClientMultiState(c);
7382 initClientMultiState(c);
7383 c->flags &= (~REDIS_MULTI);
7384 /* Make sure the EXEC command is always replicated / AOF, since we
7385 * always send the MULTI command (we can't know beforehand if the
7386 * next operations will contain at least a modification to the DB). */
7387 server.dirty++;
7388 }
7389
7390 /* =========================== Blocking Operations ========================= */
7391
7392 /* Currently Redis blocking operations support is limited to list POP ops,
7393 * so the current implementation is not fully generic, but it is also not
7394 * completely specific so it will not require a rewrite to support new
7395 * kind of blocking operations in the future.
7396 *
7397 * Still it's important to note that list blocking operations can be already
7398 * used as a notification mechanism in order to implement other blocking
7399 * operations at application level, so there must be a very strong evidence
7400 * of usefulness and generality before new blocking operations are implemented.
7401 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests while the blocking request is not served). We also put the
 *   client in a dictionary (db->blockingkeys) mapping keys to the list of
 *   clients blocking on each key.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element into the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7416 *
7417 * The above comment and the source code should be enough in order to understand
7418 * the implementation and modify / fix it later.
7419 */
7420
7421 /* Set a client in blocking mode for the specified key, with the specified
7422 * timeout */
7423 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7424 dictEntry *de;
7425 list *l;
7426 int j;
7427
7428 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7429 c->blockingkeysnum = numkeys;
7430 c->blockingto = timeout;
7431 for (j = 0; j < numkeys; j++) {
7432 /* Add the key in the client structure, to map clients -> keys */
7433 c->blockingkeys[j] = keys[j];
7434 incrRefCount(keys[j]);
7435
7436 /* And in the other "side", to map keys -> clients */
7437 de = dictFind(c->db->blockingkeys,keys[j]);
7438 if (de == NULL) {
7439 int retval;
7440
7441 /* For every key we take a list of clients blocked for it */
7442 l = listCreate();
7443 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7444 incrRefCount(keys[j]);
7445 assert(retval == DICT_OK);
7446 } else {
7447 l = dictGetEntryVal(de);
7448 }
7449 listAddNodeTail(l,c);
7450 }
7451 /* Mark the client as a blocked client */
7452 c->flags |= REDIS_BLOCKED;
7453 server.blpop_blocked_clients++;
7454 }
7455
7456 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7457 static void unblockClientWaitingData(redisClient *c) {
7458 dictEntry *de;
7459 list *l;
7460 int j;
7461
7462 assert(c->blockingkeys != NULL);
7463 /* The client may wait for multiple keys, so unblock it for every key. */
7464 for (j = 0; j < c->blockingkeysnum; j++) {
7465 /* Remove this client from the list of clients waiting for this key. */
7466 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7467 assert(de != NULL);
7468 l = dictGetEntryVal(de);
7469 listDelNode(l,listSearchKey(l,c));
7470 /* If the list is empty we need to remove it to avoid wasting memory */
7471 if (listLength(l) == 0)
7472 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7473 decrRefCount(c->blockingkeys[j]);
7474 }
7475 /* Cleanup the client structure */
7476 zfree(c->blockingkeys);
7477 c->blockingkeys = NULL;
7478 c->flags &= (~REDIS_BLOCKED);
7479 server.blpop_blocked_clients--;
7480 /* We want to process data if there is some command waiting
7481 * in the input buffer. Note that this is safe even if
7482 * unblockClientWaitingData() gets called from freeClient() because
7483 * freeClient() will be smart enough to call this function
7484 * *after* c->querybuf was set to NULL. */
7485 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7486 }
7487
7488 /* This should be called from any function PUSHing into lists.
7489 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7490 * 'ele' is the element pushed.
7491 *
7492 * If the function returns 0 there was no client waiting for a list push
7493 * against this key.
7494 *
7495 * If the function returns 1 there was a client waiting for a list push
7496 * against this key, the element was passed to this client thus it's not
7497 * needed to actually add it to the list and the caller should return asap. */
7498 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7499 struct dictEntry *de;
7500 redisClient *receiver;
7501 list *l;
7502 listNode *ln;
7503
7504 de = dictFind(c->db->blockingkeys,key);
7505 if (de == NULL) return 0;
7506 l = dictGetEntryVal(de);
7507 ln = listFirst(l);
7508 assert(ln != NULL);
7509 receiver = ln->value;
7510
7511 addReplySds(receiver,sdsnew("*2\r\n"));
7512 addReplyBulk(receiver,key);
7513 addReplyBulk(receiver,ele);
7514 unblockClientWaitingData(receiver);
7515 return 1;
7516 }
7517
7518 /* Blocking RPOP/LPOP */
7519 static void blockingPopGenericCommand(redisClient *c, int where) {
7520 robj *o;
7521 time_t timeout;
7522 int j;
7523
7524 for (j = 1; j < c->argc-1; j++) {
7525 o = lookupKeyWrite(c->db,c->argv[j]);
7526 if (o != NULL) {
7527 if (o->type != REDIS_LIST) {
7528 addReply(c,shared.wrongtypeerr);
7529 return;
7530 } else {
7531 list *list = o->ptr;
7532 if (listLength(list) != 0) {
7533 /* If the list contains elements fall back to the usual
7534 * non-blocking POP operation */
7535 robj *argv[2], **orig_argv;
7536 int orig_argc;
7537
7538 /* We need to alter the command arguments before to call
7539 * popGenericCommand() as the command takes a single key. */
7540 orig_argv = c->argv;
7541 orig_argc = c->argc;
7542 argv[1] = c->argv[j];
7543 c->argv = argv;
7544 c->argc = 2;
7545
7546 /* Also the return value is different, we need to output
7547 * the multi bulk reply header and the key name. The
7548 * "real" command will add the last element (the value)
7549 * for us. If this souds like an hack to you it's just
7550 * because it is... */
7551 addReplySds(c,sdsnew("*2\r\n"));
7552 addReplyBulk(c,argv[1]);
7553 popGenericCommand(c,where);
7554
7555 /* Fix the client structure with the original stuff */
7556 c->argv = orig_argv;
7557 c->argc = orig_argc;
7558 return;
7559 }
7560 }
7561 }
7562 }
7563 /* If the list is empty or the key does not exists we must block */
7564 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7565 if (timeout > 0) timeout += time(NULL);
7566 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7567 }
7568
7569 static void blpopCommand(redisClient *c) {
7570 blockingPopGenericCommand(c,REDIS_HEAD);
7571 }
7572
7573 static void brpopCommand(redisClient *c) {
7574 blockingPopGenericCommand(c,REDIS_TAIL);
7575 }
7576
7577 /* =============================== Replication ============================= */
7578
7579 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7580 ssize_t nwritten, ret = size;
7581 time_t start = time(NULL);
7582
7583 timeout++;
7584 while(size) {
7585 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7586 nwritten = write(fd,ptr,size);
7587 if (nwritten == -1) return -1;
7588 ptr += nwritten;
7589 size -= nwritten;
7590 }
7591 if ((time(NULL)-start) > timeout) {
7592 errno = ETIMEDOUT;
7593 return -1;
7594 }
7595 }
7596 return ret;
7597 }
7598
7599 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7600 ssize_t nread, totread = 0;
7601 time_t start = time(NULL);
7602
7603 timeout++;
7604 while(size) {
7605 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7606 nread = read(fd,ptr,size);
7607 if (nread == -1) return -1;
7608 ptr += nread;
7609 size -= nread;
7610 totread += nread;
7611 }
7612 if ((time(NULL)-start) > timeout) {
7613 errno = ETIMEDOUT;
7614 return -1;
7615 }
7616 }
7617 return totread;
7618 }
7619
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time through syncRead() (so 'timeout' applies to every single byte).
 *
 * The line terminator is not stored: reading stops at '\n', and a '\r'
 * right before it is stripped as well. The buffer is always left
 * null-terminated when 'size' is at least 1.
 *
 * Returns the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 when syncRead() fails. At most 'size'-1
 * characters are stored: a longer line is truncated and the rest of it is
 * left unread. With 'size' <= 0 nothing is read and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* No room at all, not even for the terminator. */
    if (size <= 0) return 0;
    *ptr = '\0';

    size--; /* Reserve one byte for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One slot consumed: without this the loop never stops at the end
         * of the buffer and a long line overflows it. */
        size--;
    }
    return nread;
}
7640
7641 static void syncCommand(redisClient *c) {
7642 /* ignore SYNC if aleady slave or in monitor mode */
7643 if (c->flags & REDIS_SLAVE) return;
7644
7645 /* SYNC can't be issued when the server has pending data to send to
7646 * the client about already issued commands. We need a fresh reply
7647 * buffer registering the differences between the BGSAVE and the current
7648 * dataset, so that we can copy to other slaves if needed. */
7649 if (listLength(c->reply) != 0) {
7650 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7651 return;
7652 }
7653
7654 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7655 /* Here we need to check if there is a background saving operation
7656 * in progress, or if it is required to start one */
7657 if (server.bgsavechildpid != -1) {
7658 /* Ok a background save is in progress. Let's check if it is a good
7659 * one for replication, i.e. if there is another slave that is
7660 * registering differences since the server forked to save */
7661 redisClient *slave;
7662 listNode *ln;
7663 listIter li;
7664
7665 listRewind(server.slaves,&li);
7666 while((ln = listNext(&li))) {
7667 slave = ln->value;
7668 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7669 }
7670 if (ln) {
7671 /* Perfect, the server is already registering differences for
7672 * another slave. Set the right state, and copy the buffer. */
7673 listRelease(c->reply);
7674 c->reply = listDup(slave->reply);
7675 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7676 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7677 } else {
7678 /* No way, we need to wait for the next BGSAVE in order to
7679 * register differences */
7680 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7681 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7682 }
7683 } else {
7684 /* Ok we don't have a BGSAVE in progress, let's start one */
7685 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7686 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7687 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7688 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7689 return;
7690 }
7691 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7692 }
7693 c->repldbfd = -1;
7694 c->flags |= REDIS_SLAVE;
7695 c->slaveseldb = 0;
7696 listAddNodeTail(server.slaves,c);
7697 return;
7698 }
7699
7700 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7701 redisClient *slave = privdata;
7702 REDIS_NOTUSED(el);
7703 REDIS_NOTUSED(mask);
7704 char buf[REDIS_IOBUF_LEN];
7705 ssize_t nwritten, buflen;
7706
7707 if (slave->repldboff == 0) {
7708 /* Write the bulk write count before to transfer the DB. In theory here
7709 * we don't know how much room there is in the output buffer of the
7710 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7711 * operations) will never be smaller than the few bytes we need. */
7712 sds bulkcount;
7713
7714 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7715 slave->repldbsize);
7716 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7717 {
7718 sdsfree(bulkcount);
7719 freeClient(slave);
7720 return;
7721 }
7722 sdsfree(bulkcount);
7723 }
7724 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7725 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7726 if (buflen <= 0) {
7727 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7728 (buflen == 0) ? "premature EOF" : strerror(errno));
7729 freeClient(slave);
7730 return;
7731 }
7732 if ((nwritten = write(fd,buf,buflen)) == -1) {
7733 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7734 strerror(errno));
7735 freeClient(slave);
7736 return;
7737 }
7738 slave->repldboff += nwritten;
7739 if (slave->repldboff == slave->repldbsize) {
7740 close(slave->repldbfd);
7741 slave->repldbfd = -1;
7742 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7743 slave->replstate = REDIS_REPL_ONLINE;
7744 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7745 sendReplyToClient, slave) == AE_ERR) {
7746 freeClient(slave);
7747 return;
7748 }
7749 addReplySds(slave,sdsempty());
7750 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7751 }
7752 }
7753
7754 /* This function is called at the end of every backgrond saving.
7755 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7756 * otherwise REDIS_ERR is passed to the function.
7757 *
7758 * The goal of this function is to handle slaves waiting for a successful
7759 * background saving in order to perform non-blocking synchronization. */
7760 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7761 listNode *ln;
7762 int startbgsave = 0;
7763 listIter li;
7764
7765 listRewind(server.slaves,&li);
7766 while((ln = listNext(&li))) {
7767 redisClient *slave = ln->value;
7768
7769 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7770 startbgsave = 1;
7771 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7772 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7773 struct redis_stat buf;
7774
7775 if (bgsaveerr != REDIS_OK) {
7776 freeClient(slave);
7777 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7778 continue;
7779 }
7780 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7781 redis_fstat(slave->repldbfd,&buf) == -1) {
7782 freeClient(slave);
7783 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7784 continue;
7785 }
7786 slave->repldboff = 0;
7787 slave->repldbsize = buf.st_size;
7788 slave->replstate = REDIS_REPL_SEND_BULK;
7789 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7790 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7791 freeClient(slave);
7792 continue;
7793 }
7794 }
7795 }
7796 if (startbgsave) {
7797 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7798 listIter li;
7799
7800 listRewind(server.slaves,&li);
7801 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7802 while((ln = listNext(&li))) {
7803 redisClient *slave = ln->value;
7804
7805 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7806 freeClient(slave);
7807 }
7808 }
7809 }
7810 }
7811
7812 static int syncWithMaster(void) {
7813 char buf[1024], tmpfile[256], authcmd[1024];
7814 long dumpsize;
7815 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7816 int dfd, maxtries = 5;
7817
7818 if (fd == -1) {
7819 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7820 strerror(errno));
7821 return REDIS_ERR;
7822 }
7823
7824 /* AUTH with the master if required. */
7825 if(server.masterauth) {
7826 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7827 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7828 close(fd);
7829 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7830 strerror(errno));
7831 return REDIS_ERR;
7832 }
7833 /* Read the AUTH result. */
7834 if (syncReadLine(fd,buf,1024,3600) == -1) {
7835 close(fd);
7836 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7837 strerror(errno));
7838 return REDIS_ERR;
7839 }
7840 if (buf[0] != '+') {
7841 close(fd);
7842 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7843 return REDIS_ERR;
7844 }
7845 }
7846
7847 /* Issue the SYNC command */
7848 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7849 close(fd);
7850 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7851 strerror(errno));
7852 return REDIS_ERR;
7853 }
7854 /* Read the bulk write count */
7855 if (syncReadLine(fd,buf,1024,3600) == -1) {
7856 close(fd);
7857 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7858 strerror(errno));
7859 return REDIS_ERR;
7860 }
7861 if (buf[0] != '$') {
7862 close(fd);
7863 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7864 return REDIS_ERR;
7865 }
7866 dumpsize = strtol(buf+1,NULL,10);
7867 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7868 /* Read the bulk write data on a temp file */
7869 while(maxtries--) {
7870 snprintf(tmpfile,256,
7871 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7872 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7873 if (dfd != -1) break;
7874 sleep(1);
7875 }
7876 if (dfd == -1) {
7877 close(fd);
7878 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7879 return REDIS_ERR;
7880 }
7881 while(dumpsize) {
7882 int nread, nwritten;
7883
7884 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7885 if (nread == -1) {
7886 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7887 strerror(errno));
7888 close(fd);
7889 close(dfd);
7890 return REDIS_ERR;
7891 }
7892 nwritten = write(dfd,buf,nread);
7893 if (nwritten == -1) {
7894 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7895 close(fd);
7896 close(dfd);
7897 return REDIS_ERR;
7898 }
7899 dumpsize -= nread;
7900 }
7901 close(dfd);
7902 if (rename(tmpfile,server.dbfilename) == -1) {
7903 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7904 unlink(tmpfile);
7905 close(fd);
7906 return REDIS_ERR;
7907 }
7908 emptyDb();
7909 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7910 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7911 close(fd);
7912 return REDIS_ERR;
7913 }
7914 server.master = createClient(fd);
7915 server.master->flags |= REDIS_MASTER;
7916 server.master->authenticated = 1;
7917 server.replstate = REDIS_REPL_CONNECTED;
7918 return REDIS_OK;
7919 }
7920
7921 static void slaveofCommand(redisClient *c) {
7922 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7923 !strcasecmp(c->argv[2]->ptr,"one")) {
7924 if (server.masterhost) {
7925 sdsfree(server.masterhost);
7926 server.masterhost = NULL;
7927 if (server.master) freeClient(server.master);
7928 server.replstate = REDIS_REPL_NONE;
7929 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7930 }
7931 } else {
7932 sdsfree(server.masterhost);
7933 server.masterhost = sdsdup(c->argv[1]->ptr);
7934 server.masterport = atoi(c->argv[2]->ptr);
7935 if (server.master) freeClient(server.master);
7936 server.replstate = REDIS_REPL_CONNECT;
7937 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7938 server.masterhost, server.masterport);
7939 }
7940 addReply(c,shared.ok);
7941 }
7942
7943 /* ============================ Maxmemory directive ======================== */
7944
7945 /* Try to free one object form the pre-allocated objects free list.
7946 * This is useful under low mem conditions as by default we take 1 million
7947 * free objects allocated. On success REDIS_OK is returned, otherwise
7948 * REDIS_ERR. */
7949 static int tryFreeOneObjectFromFreelist(void) {
7950 robj *o;
7951
7952 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7953 if (listLength(server.objfreelist)) {
7954 listNode *head = listFirst(server.objfreelist);
7955 o = listNodeValue(head);
7956 listDelNode(server.objfreelist,head);
7957 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7958 zfree(o);
7959 return REDIS_OK;
7960 } else {
7961 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7962 return REDIS_ERR;
7963 }
7964 }
7965
7966 /* This function gets called when 'maxmemory' is set on the config file to limit
7967 * the max memory used by the server, and we are out of memory.
7968 * This function will try to, in order:
7969 *
7970 * - Free objects from the free list
7971 * - Try to remove keys with an EXPIRE set
7972 *
7973 * It is not possible to free enough memory to reach used-memory < maxmemory
7974 * the server will start refusing commands that will enlarge even more the
7975 * memory usage.
7976 */
7977 static void freeMemoryIfNeeded(void) {
7978 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7979 int j, k, freed = 0;
7980
7981 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7982 for (j = 0; j < server.dbnum; j++) {
7983 int minttl = -1;
7984 robj *minkey = NULL;
7985 struct dictEntry *de;
7986
7987 if (dictSize(server.db[j].expires)) {
7988 freed = 1;
7989 /* From a sample of three keys drop the one nearest to
7990 * the natural expire */
7991 for (k = 0; k < 3; k++) {
7992 time_t t;
7993
7994 de = dictGetRandomKey(server.db[j].expires);
7995 t = (time_t) dictGetEntryVal(de);
7996 if (minttl == -1 || t < minttl) {
7997 minkey = dictGetEntryKey(de);
7998 minttl = t;
7999 }
8000 }
8001 deleteKey(server.db+j,minkey);
8002 }
8003 }
8004 if (!freed) return; /* nothing to free... */
8005 }
8006 }
8007
8008 /* ============================== Append Only file ========================== */
8009
8010 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8011 sds buf = sdsempty();
8012 int j;
8013 ssize_t nwritten;
8014 time_t now;
8015 robj *tmpargv[3];
8016
8017 /* The DB this command was targetting is not the same as the last command
8018 * we appendend. To issue a SELECT command is needed. */
8019 if (dictid != server.appendseldb) {
8020 char seldb[64];
8021
8022 snprintf(seldb,sizeof(seldb),"%d",dictid);
8023 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
8024 (unsigned long)strlen(seldb),seldb);
8025 server.appendseldb = dictid;
8026 }
8027
8028 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8029 * EXPIREs into EXPIREATs calls */
8030 if (cmd->proc == expireCommand) {
8031 long when;
8032
8033 tmpargv[0] = createStringObject("EXPIREAT",8);
8034 tmpargv[1] = argv[1];
8035 incrRefCount(argv[1]);
8036 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8037 tmpargv[2] = createObject(REDIS_STRING,
8038 sdscatprintf(sdsempty(),"%ld",when));
8039 argv = tmpargv;
8040 }
8041
8042 /* Append the actual command */
8043 buf = sdscatprintf(buf,"*%d\r\n",argc);
8044 for (j = 0; j < argc; j++) {
8045 robj *o = argv[j];
8046
8047 o = getDecodedObject(o);
8048 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8049 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8050 buf = sdscatlen(buf,"\r\n",2);
8051 decrRefCount(o);
8052 }
8053
8054 /* Free the objects from the modified argv for EXPIREAT */
8055 if (cmd->proc == expireCommand) {
8056 for (j = 0; j < 3; j++)
8057 decrRefCount(argv[j]);
8058 }
8059
8060 /* We want to perform a single write. This should be guaranteed atomic
8061 * at least if the filesystem we are writing is a real physical one.
8062 * While this will save us against the server being killed I don't think
8063 * there is much to do about the whole server stopping for power problems
8064 * or alike */
8065 nwritten = write(server.appendfd,buf,sdslen(buf));
8066 if (nwritten != (signed)sdslen(buf)) {
8067 /* Ooops, we are in troubles. The best thing to do for now is
8068 * to simply exit instead to give the illusion that everything is
8069 * working as expected. */
8070 if (nwritten == -1) {
8071 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8072 } else {
8073 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8074 }
8075 exit(1);
8076 }
8077 /* If a background append only file rewriting is in progress we want to
8078 * accumulate the differences between the child DB and the current one
8079 * in a buffer, so that when the child process will do its work we
8080 * can append the differences to the new append only file. */
8081 if (server.bgrewritechildpid != -1)
8082 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8083
8084 sdsfree(buf);
8085 now = time(NULL);
8086 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8087 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8088 now-server.lastfsync > 1))
8089 {
8090 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8091 * flushing metadata. */
8092 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8093 server.lastfsync = now;
8094 }
8095 }
8096
8097 /* In Redis commands are always executed in the context of a client, so in
8098 * order to load the append only file we need to create a fake client. */
8099 static struct redisClient *createFakeClient(void) {
8100 struct redisClient *c = zmalloc(sizeof(*c));
8101
8102 selectDb(c,0);
8103 c->fd = -1;
8104 c->querybuf = sdsempty();
8105 c->argc = 0;
8106 c->argv = NULL;
8107 c->flags = 0;
8108 /* We set the fake client as a slave waiting for the synchronization
8109 * so that Redis will not try to send replies to this client. */
8110 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8111 c->reply = listCreate();
8112 listSetFreeMethod(c->reply,decrRefCount);
8113 listSetDupMethod(c->reply,dupClientReplyValue);
8114 return c;
8115 }
8116
8117 static void freeFakeClient(struct redisClient *c) {
8118 sdsfree(c->querybuf);
8119 listRelease(c->reply);
8120 zfree(c);
8121 }
8122
8123 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8124 * error (the append only file is zero-length) REDIS_ERR is returned. On
8125 * fatal error an error message is logged and the program exists. */
8126 int loadAppendOnlyFile(char *filename) {
8127 struct redisClient *fakeClient;
8128 FILE *fp = fopen(filename,"r");
8129 struct redis_stat sb;
8130 unsigned long long loadedkeys = 0;
8131
8132 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8133 return REDIS_ERR;
8134
8135 if (fp == NULL) {
8136 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8137 exit(1);
8138 }
8139
8140 fakeClient = createFakeClient();
8141 while(1) {
8142 int argc, j;
8143 unsigned long len;
8144 robj **argv;
8145 char buf[128];
8146 sds argsds;
8147 struct redisCommand *cmd;
8148
8149 if (fgets(buf,sizeof(buf),fp) == NULL) {
8150 if (feof(fp))
8151 break;
8152 else
8153 goto readerr;
8154 }
8155 if (buf[0] != '*') goto fmterr;
8156 argc = atoi(buf+1);
8157 argv = zmalloc(sizeof(robj*)*argc);
8158 for (j = 0; j < argc; j++) {
8159 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8160 if (buf[0] != '$') goto fmterr;
8161 len = strtol(buf+1,NULL,10);
8162 argsds = sdsnewlen(NULL,len);
8163 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8164 argv[j] = createObject(REDIS_STRING,argsds);
8165 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8166 }
8167
8168 /* Command lookup */
8169 cmd = lookupCommand(argv[0]->ptr);
8170 if (!cmd) {
8171 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8172 exit(1);
8173 }
8174 /* Try object encoding */
8175 if (cmd->flags & REDIS_CMD_BULK)
8176 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8177 /* Run the command in the context of a fake client */
8178 fakeClient->argc = argc;
8179 fakeClient->argv = argv;
8180 cmd->proc(fakeClient);
8181 /* Discard the reply objects list from the fake client */
8182 while(listLength(fakeClient->reply))
8183 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8184 /* Clean up, ready for the next command */
8185 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8186 zfree(argv);
8187 /* Handle swapping while loading big datasets when VM is on */
8188 loadedkeys++;
8189 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8190 while (zmalloc_used_memory() > server.vm_max_memory) {
8191 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8192 }
8193 }
8194 }
8195 fclose(fp);
8196 freeFakeClient(fakeClient);
8197 return REDIS_OK;
8198
8199 readerr:
8200 if (feof(fp)) {
8201 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8202 } else {
8203 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8204 }
8205 exit(1);
8206 fmterr:
8207 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8208 exit(1);
8209 }
8210
8211 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8212 static int fwriteBulkObject(FILE *fp, robj *obj) {
8213 char buf[128];
8214 int decrrc = 0;
8215
8216 /* Avoid the incr/decr ref count business if possible to help
8217 * copy-on-write (we are often in a child process when this function
8218 * is called).
8219 * Also makes sure that key objects don't get incrRefCount-ed when VM
8220 * is enabled */
8221 if (obj->encoding != REDIS_ENCODING_RAW) {
8222 obj = getDecodedObject(obj);
8223 decrrc = 1;
8224 }
8225 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8226 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8227 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8228 goto err;
8229 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8230 if (decrrc) decrRefCount(obj);
8231 return 1;
8232 err:
8233 if (decrrc) decrRefCount(obj);
8234 return 0;
8235 }
8236
/* Write the binary-safe string 's' of 'len' bytes into 'fp' using the
 * bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* Nothing to write for an empty payload. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8248
/* Write the double 'd' into 'fp', formatted with 17 significant digits,
 * using the bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8259
/* Write the long 'l' into 'fp', in decimal, using the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char digits[128], header[128];
    int digitslen, headerlen;

    /* 'digits' holds the number followed by CRLF: the two trailing bytes
     * are not counted in the bulk length. */
    digitslen = snprintf(digits,sizeof(digits),"%ld\r\n",l);
    headerlen = snprintf(header,sizeof(header),"$%lu\r\n",
        (unsigned long)digitslen-2);
    if (fwrite(header,(size_t)headerlen,1,fp) == 0) return 0;
    if (fwrite(digits,(size_t)digitslen,1,fp) == 0) return 0;
    return 1;
}
8270
8271 /* Write a sequence of commands able to fully rebuild the dataset into
8272 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8273 static int rewriteAppendOnlyFile(char *filename) {
8274 dictIterator *di = NULL;
8275 dictEntry *de;
8276 FILE *fp;
8277 char tmpfile[256];
8278 int j;
8279 time_t now = time(NULL);
8280
8281 /* Note that we have to use a different temp name here compared to the
8282 * one used by rewriteAppendOnlyFileBackground() function. */
8283 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8284 fp = fopen(tmpfile,"w");
8285 if (!fp) {
8286 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8287 return REDIS_ERR;
8288 }
8289 for (j = 0; j < server.dbnum; j++) {
8290 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8291 redisDb *db = server.db+j;
8292 dict *d = db->dict;
8293 if (dictSize(d) == 0) continue;
8294 di = dictGetIterator(d);
8295 if (!di) {
8296 fclose(fp);
8297 return REDIS_ERR;
8298 }
8299
8300 /* SELECT the new DB */
8301 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8302 if (fwriteBulkLong(fp,j) == 0) goto werr;
8303
8304 /* Iterate this DB writing every entry */
8305 while((de = dictNext(di)) != NULL) {
8306 robj *key, *o;
8307 time_t expiretime;
8308 int swapped;
8309
8310 key = dictGetEntryKey(de);
8311 /* If the value for this key is swapped, load a preview in memory.
8312 * We use a "swapped" flag to remember if we need to free the
8313 * value object instead to just increment the ref count anyway
8314 * in order to avoid copy-on-write of pages if we are forked() */
8315 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8316 key->storage == REDIS_VM_SWAPPING) {
8317 o = dictGetEntryVal(de);
8318 swapped = 0;
8319 } else {
8320 o = vmPreviewObject(key);
8321 swapped = 1;
8322 }
8323 expiretime = getExpire(db,key);
8324
8325 /* Save the key and associated value */
8326 if (o->type == REDIS_STRING) {
8327 /* Emit a SET command */
8328 char cmd[]="*3\r\n$3\r\nSET\r\n";
8329 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8330 /* Key and value */
8331 if (fwriteBulkObject(fp,key) == 0) goto werr;
8332 if (fwriteBulkObject(fp,o) == 0) goto werr;
8333 } else if (o->type == REDIS_LIST) {
8334 /* Emit the RPUSHes needed to rebuild the list */
8335 list *list = o->ptr;
8336 listNode *ln;
8337 listIter li;
8338
8339 listRewind(list,&li);
8340 while((ln = listNext(&li))) {
8341 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8342 robj *eleobj = listNodeValue(ln);
8343
8344 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8345 if (fwriteBulkObject(fp,key) == 0) goto werr;
8346 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8347 }
8348 } else if (o->type == REDIS_SET) {
8349 /* Emit the SADDs needed to rebuild the set */
8350 dict *set = o->ptr;
8351 dictIterator *di = dictGetIterator(set);
8352 dictEntry *de;
8353
8354 while((de = dictNext(di)) != NULL) {
8355 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8356 robj *eleobj = dictGetEntryKey(de);
8357
8358 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8359 if (fwriteBulkObject(fp,key) == 0) goto werr;
8360 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8361 }
8362 dictReleaseIterator(di);
8363 } else if (o->type == REDIS_ZSET) {
8364 /* Emit the ZADDs needed to rebuild the sorted set */
8365 zset *zs = o->ptr;
8366 dictIterator *di = dictGetIterator(zs->dict);
8367 dictEntry *de;
8368
8369 while((de = dictNext(di)) != NULL) {
8370 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8371 robj *eleobj = dictGetEntryKey(de);
8372 double *score = dictGetEntryVal(de);
8373
8374 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8375 if (fwriteBulkObject(fp,key) == 0) goto werr;
8376 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8377 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8378 }
8379 dictReleaseIterator(di);
8380 } else if (o->type == REDIS_HASH) {
8381 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8382
8383 /* Emit the HSETs needed to rebuild the hash */
8384 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8385 unsigned char *p = zipmapRewind(o->ptr);
8386 unsigned char *field, *val;
8387 unsigned int flen, vlen;
8388
8389 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8390 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8391 if (fwriteBulkObject(fp,key) == 0) goto werr;
8392 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8393 return -1;
8394 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8395 return -1;
8396 }
8397 } else {
8398 dictIterator *di = dictGetIterator(o->ptr);
8399 dictEntry *de;
8400
8401 while((de = dictNext(di)) != NULL) {
8402 robj *field = dictGetEntryKey(de);
8403 robj *val = dictGetEntryVal(de);
8404
8405 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8406 if (fwriteBulkObject(fp,key) == 0) goto werr;
8407 if (fwriteBulkObject(fp,field) == -1) return -1;
8408 if (fwriteBulkObject(fp,val) == -1) return -1;
8409 }
8410 dictReleaseIterator(di);
8411 }
8412 } else {
8413 redisPanic("Unknown object type");
8414 }
8415 /* Save the expire time */
8416 if (expiretime != -1) {
8417 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8418 /* If this key is already expired skip it */
8419 if (expiretime < now) continue;
8420 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8421 if (fwriteBulkObject(fp,key) == 0) goto werr;
8422 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8423 }
8424 if (swapped) decrRefCount(o);
8425 }
8426 dictReleaseIterator(di);
8427 }
8428
8429 /* Make sure data will not remain on the OS's output buffers */
8430 fflush(fp);
8431 fsync(fileno(fp));
8432 fclose(fp);
8433
8434 /* Use RENAME to make sure the DB file is changed atomically only
8435 * if the generate DB file is ok. */
8436 if (rename(tmpfile,filename) == -1) {
8437 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8438 unlink(tmpfile);
8439 return REDIS_ERR;
8440 }
8441 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8442 return REDIS_OK;
8443
8444 werr:
8445 fclose(fp);
8446 unlink(tmpfile);
8447 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8448 if (di) dictReleaseIterator(di);
8449 return REDIS_ERR;
8450 }
8451
8452 /* This is how rewriting of the append only file in background works:
8453 *
8454 * 1) The user calls BGREWRITEAOF
8455 * 2) Redis calls this function, that forks():
8456 * 2a) the child rewrite the append only file in a temp file.
8457 * 2b) the parent accumulates differences in server.bgrewritebuf.
8458 * 3) When the child finished '2a' exists.
8459 * 4) The parent will trap the exit code, if it's OK, will append the
8460 * data accumulated into server.bgrewritebuf into the temp file, and
8461 * finally will rename(2) the temp file in the actual file name.
8462 * The the new file is reopened as the new append only file. Profit!
8463 */
8464 static int rewriteAppendOnlyFileBackground(void) {
8465 pid_t childpid;
8466
8467 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8468 if (server.vm_enabled) waitEmptyIOJobsQueue();
8469 if ((childpid = fork()) == 0) {
8470 /* Child */
8471 char tmpfile[256];
8472
8473 if (server.vm_enabled) vmReopenSwapFile();
8474 close(server.fd);
8475 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8476 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8477 _exit(0);
8478 } else {
8479 _exit(1);
8480 }
8481 } else {
8482 /* Parent */
8483 if (childpid == -1) {
8484 redisLog(REDIS_WARNING,
8485 "Can't rewrite append only file in background: fork: %s",
8486 strerror(errno));
8487 return REDIS_ERR;
8488 }
8489 redisLog(REDIS_NOTICE,
8490 "Background append only file rewriting started by pid %d",childpid);
8491 server.bgrewritechildpid = childpid;
8492 updateDictResizePolicy();
8493 /* We set appendseldb to -1 in order to force the next call to the
8494 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8495 * accumulated by the parent into server.bgrewritebuf will start
8496 * with a SELECT statement and it will be safe to merge. */
8497 server.appendseldb = -1;
8498 return REDIS_OK;
8499 }
8500 return REDIS_OK; /* unreached */
8501 }
8502
8503 static void bgrewriteaofCommand(redisClient *c) {
8504 if (server.bgrewritechildpid != -1) {
8505 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8506 return;
8507 }
8508 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8509 char *status = "+Background append only file rewriting started\r\n";
8510 addReplySds(c,sdsnew(status));
8511 } else {
8512 addReply(c,shared.err);
8513 }
8514 }
8515
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" where <pid> is
 * 'childpid'. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8522
8523 /* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
8525 * - Threaded Virtual Memory I/O
8526 * The two parts are not fully decoupled, but functions are split among two
8527 * different sections of the source code (delimited by comments) in order to
8528 * make more clear what functionality is about the blocking VM and what about
8529 * the threaded (not blocking) VM.
8530 *
8531 * Redis VM design:
8532 *
8533 * Redis VM is a blocking VM (one that blocks reading swapped values from
8534 * disk into memory when a value swapped out is needed in memory) that is made
8535 * unblocking by trying to examine the command argument vector in order to
8536 * load in background values that will likely be needed in order to exec
8537 * the command. The command is executed only once all the relevant keys
8538 * are loaded into memory.
8539 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
8542 */
8543
8544 /* =================== Virtual Memory - Blocking Side ====================== */
8545
8546 /* substitute the first occurrence of '%p' with the process pid in the
8547 * swap file name. */
8548 static void expandVmSwapFilename(void) {
8549 char *p = strstr(server.vm_swap_file,"%p");
8550 sds new;
8551
8552 if (!p) return;
8553 new = sdsempty();
8554 *p = '\0';
8555 new = sdscat(new,server.vm_swap_file);
8556 new = sdscatprintf(new,"%ld",(long) getpid());
8557 new = sdscat(new,p+2);
8558 zfree(server.vm_swap_file);
8559 server.vm_swap_file = new;
8560 }
8561
8562 static void vmInit(void) {
8563 off_t totsize;
8564 int pipefds[2];
8565 size_t stacksize;
8566
8567 if (server.vm_max_threads != 0)
8568 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8569
8570 expandVmSwapFilename();
8571 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8572 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8573 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8574 }
8575 if (server.vm_fp == NULL) {
8576 redisLog(REDIS_WARNING,
8577 "Impossible to open the swap file: %s. Exiting.",
8578 strerror(errno));
8579 exit(1);
8580 }
8581 server.vm_fd = fileno(server.vm_fp);
8582 server.vm_next_page = 0;
8583 server.vm_near_pages = 0;
8584 server.vm_stats_used_pages = 0;
8585 server.vm_stats_swapped_objects = 0;
8586 server.vm_stats_swapouts = 0;
8587 server.vm_stats_swapins = 0;
8588 totsize = server.vm_pages*server.vm_page_size;
8589 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8590 if (ftruncate(server.vm_fd,totsize) == -1) {
8591 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8592 strerror(errno));
8593 exit(1);
8594 } else {
8595 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8596 }
8597 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8598 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8599 (long long) (server.vm_pages+7)/8, server.vm_pages);
8600 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8601
8602 /* Initialize threaded I/O (used by Virtual Memory) */
8603 server.io_newjobs = listCreate();
8604 server.io_processing = listCreate();
8605 server.io_processed = listCreate();
8606 server.io_ready_clients = listCreate();
8607 pthread_mutex_init(&server.io_mutex,NULL);
8608 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8609 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8610 server.io_active_threads = 0;
8611 if (pipe(pipefds) == -1) {
8612 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8613 ,strerror(errno));
8614 exit(1);
8615 }
8616 server.io_ready_pipe_read = pipefds[0];
8617 server.io_ready_pipe_write = pipefds[1];
8618 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8619 /* LZF requires a lot of stack */
8620 pthread_attr_init(&server.io_threads_attr);
8621 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8622 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8623 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8624 /* Listen for events in the threaded I/O pipe */
8625 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8626 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8627 oom("creating file event");
8628 }
8629
8630 /* Mark the page as used */
8631 static void vmMarkPageUsed(off_t page) {
8632 off_t byte = page/8;
8633 int bit = page&7;
8634 redisAssert(vmFreePage(page) == 1);
8635 server.vm_bitmap[byte] |= 1<<bit;
8636 }
8637
8638 /* Mark N contiguous pages as used, with 'page' being the first. */
8639 static void vmMarkPagesUsed(off_t page, off_t count) {
8640 off_t j;
8641
8642 for (j = 0; j < count; j++)
8643 vmMarkPageUsed(page+j);
8644 server.vm_stats_used_pages += count;
8645 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8646 (long long)count, (long long)page);
8647 }
8648
8649 /* Mark the page as free */
8650 static void vmMarkPageFree(off_t page) {
8651 off_t byte = page/8;
8652 int bit = page&7;
8653 redisAssert(vmFreePage(page) == 0);
8654 server.vm_bitmap[byte] &= ~(1<<bit);
8655 }
8656
8657 /* Mark N contiguous pages as free, with 'page' being the first. */
8658 static void vmMarkPagesFree(off_t page, off_t count) {
8659 off_t j;
8660
8661 for (j = 0; j < count; j++)
8662 vmMarkPageFree(page+j);
8663 server.vm_stats_used_pages -= count;
8664 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8665 (long long)count, (long long)page);
8666 }
8667
8668 /* Test if the page is free */
8669 static int vmFreePage(off_t page) {
8670 off_t byte = page/8;
8671 int bit = page&7;
8672 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8673 }
8674
8675 /* Find N contiguous free pages storing the first page of the cluster in *first.
8676 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8677 * REDIS_ERR is returned.
8678 *
8679 * This function uses a simple algorithm: we try to allocate
8680 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8681 * again from the start of the swap file searching for free spaces.
8682 *
8683 * If it looks pretty clear that there are no free pages near our offset
8684 * we try to find less populated places doing a forward jump of
8685 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8686 * without hurry, and then we jump again and so forth...
8687 *
8688 * This function can be improved using a free list to avoid to guess
8689 * too much, since we could collect data about freed pages.
8690 *
8691 * note: I implemented this function just after watching an episode of
8692 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8693 */
8694 static int vmFindContiguousPages(off_t *first, off_t n) {
8695 off_t base, offset = 0, since_jump = 0, numfree = 0;
8696
8697 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8698 server.vm_near_pages = 0;
8699 server.vm_next_page = 0;
8700 }
8701 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8702 base = server.vm_next_page;
8703
8704 while(offset < server.vm_pages) {
8705 off_t this = base+offset;
8706
8707 /* If we overflow, restart from page zero */
8708 if (this >= server.vm_pages) {
8709 this -= server.vm_pages;
8710 if (this == 0) {
8711 /* Just overflowed, what we found on tail is no longer
8712 * interesting, as it's no longer contiguous. */
8713 numfree = 0;
8714 }
8715 }
8716 if (vmFreePage(this)) {
8717 /* This is a free page */
8718 numfree++;
8719 /* Already got N free pages? Return to the caller, with success */
8720 if (numfree == n) {
8721 *first = this-(n-1);
8722 server.vm_next_page = this+1;
8723 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8724 return REDIS_OK;
8725 }
8726 } else {
8727 /* The current one is not a free page */
8728 numfree = 0;
8729 }
8730
8731 /* Fast-forward if the current page is not free and we already
8732 * searched enough near this place. */
8733 since_jump++;
8734 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8735 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8736 since_jump = 0;
8737 /* Note that even if we rewind after the jump, we are don't need
8738 * to make sure numfree is set to zero as we only jump *if* it
8739 * is set to zero. */
8740 } else {
8741 /* Otherwise just check the next page */
8742 offset++;
8743 }
8744 }
8745 return REDIS_ERR;
8746 }
8747
8748 /* Write the specified object at the specified page of the swap file */
8749 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8750 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8751 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8752 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8753 redisLog(REDIS_WARNING,
8754 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8755 strerror(errno));
8756 return REDIS_ERR;
8757 }
8758 rdbSaveObject(server.vm_fp,o);
8759 fflush(server.vm_fp);
8760 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8761 return REDIS_OK;
8762 }
8763
8764 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8765 * needed to later retrieve the object into the key object.
8766 * If we can't find enough contiguous empty pages to swap the object on disk
8767 * REDIS_ERR is returned. */
8768 static int vmSwapObjectBlocking(robj *key, robj *val) {
8769 off_t pages = rdbSavedObjectPages(val,NULL);
8770 off_t page;
8771
8772 assert(key->storage == REDIS_VM_MEMORY);
8773 assert(key->refcount == 1);
8774 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8775 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8776 key->vm.page = page;
8777 key->vm.usedpages = pages;
8778 key->storage = REDIS_VM_SWAPPED;
8779 key->vtype = val->type;
8780 decrRefCount(val); /* Deallocate the object from memory. */
8781 vmMarkPagesUsed(page,pages);
8782 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8783 (unsigned char*) key->ptr,
8784 (unsigned long long) page, (unsigned long long) pages);
8785 server.vm_stats_swapped_objects++;
8786 server.vm_stats_swapouts++;
8787 return REDIS_OK;
8788 }
8789
8790 static robj *vmReadObjectFromSwap(off_t page, int type) {
8791 robj *o;
8792
8793 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8794 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8795 redisLog(REDIS_WARNING,
8796 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8797 strerror(errno));
8798 _exit(1);
8799 }
8800 o = rdbLoadObject(type,server.vm_fp);
8801 if (o == NULL) {
8802 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8803 _exit(1);
8804 }
8805 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8806 return o;
8807 }
8808
8809 /* Load the value object relative to the 'key' object from swap to memory.
8810 * The newly allocated object is returned.
8811 *
8812 * If preview is true the unserialized object is returned to the caller but
8813 * no changes are made to the key object, nor the pages are marked as freed */
8814 static robj *vmGenericLoadObject(robj *key, int preview) {
8815 robj *val;
8816
8817 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8818 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8819 if (!preview) {
8820 key->storage = REDIS_VM_MEMORY;
8821 key->vm.atime = server.unixtime;
8822 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8823 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8824 (unsigned char*) key->ptr);
8825 server.vm_stats_swapped_objects--;
8826 } else {
8827 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8828 (unsigned char*) key->ptr);
8829 }
8830 server.vm_stats_swapins++;
8831 return val;
8832 }
8833
8834 /* Plain object loading, from swap to memory */
8835 static robj *vmLoadObject(robj *key) {
8836 /* If we are loading the object in background, stop it, we
8837 * need to load this object synchronously ASAP. */
8838 if (key->storage == REDIS_VM_LOADING)
8839 vmCancelThreadedIOJob(key);
8840 return vmGenericLoadObject(key,0);
8841 }
8842
8843 /* Just load the value on disk, without to modify the key.
8844 * This is useful when we want to perform some operation on the value
8845 * without to really bring it from swap to memory, like while saving the
8846 * dataset or rewriting the append only log. */
8847 static robj *vmPreviewObject(robj *key) {
8848 return vmGenericLoadObject(key,1);
8849 }
8850
8851 /* How a good candidate is this object for swapping?
8852 * The better candidate it is, the greater the returned value.
8853 *
8854 * Currently we try to perform a fast estimation of the object size in
8855 * memory, and combine it with aging informations.
8856 *
8857 * Basically swappability = idle-time * log(estimated size)
8858 *
8859 * Bigger objects are preferred over smaller objects, but not
8860 * proportionally, this is why we use the logarithm. This algorithm is
8861 * just a first try and will probably be tuned later. */
8862 static double computeObjectSwappability(robj *o) {
8863 time_t age = server.unixtime - o->vm.atime;
8864 long asize = 0;
8865 list *l;
8866 dict *d;
8867 struct dictEntry *de;
8868 int z;
8869
8870 if (age <= 0) return 0;
8871 switch(o->type) {
8872 case REDIS_STRING:
8873 if (o->encoding != REDIS_ENCODING_RAW) {
8874 asize = sizeof(*o);
8875 } else {
8876 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8877 }
8878 break;
8879 case REDIS_LIST:
8880 l = o->ptr;
8881 listNode *ln = listFirst(l);
8882
8883 asize = sizeof(list);
8884 if (ln) {
8885 robj *ele = ln->value;
8886 long elesize;
8887
8888 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8889 (sizeof(*o)+sdslen(ele->ptr)) :
8890 sizeof(*o);
8891 asize += (sizeof(listNode)+elesize)*listLength(l);
8892 }
8893 break;
8894 case REDIS_SET:
8895 case REDIS_ZSET:
8896 z = (o->type == REDIS_ZSET);
8897 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8898
8899 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8900 if (z) asize += sizeof(zset)-sizeof(dict);
8901 if (dictSize(d)) {
8902 long elesize;
8903 robj *ele;
8904
8905 de = dictGetRandomKey(d);
8906 ele = dictGetEntryKey(de);
8907 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8908 (sizeof(*o)+sdslen(ele->ptr)) :
8909 sizeof(*o);
8910 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8911 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8912 }
8913 break;
8914 case REDIS_HASH:
8915 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8916 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8917 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8918 unsigned int klen, vlen;
8919 unsigned char *key, *val;
8920
8921 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8922 klen = 0;
8923 vlen = 0;
8924 }
8925 asize = len*(klen+vlen+3);
8926 } else if (o->encoding == REDIS_ENCODING_HT) {
8927 d = o->ptr;
8928 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8929 if (dictSize(d)) {
8930 long elesize;
8931 robj *ele;
8932
8933 de = dictGetRandomKey(d);
8934 ele = dictGetEntryKey(de);
8935 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8936 (sizeof(*o)+sdslen(ele->ptr)) :
8937 sizeof(*o);
8938 ele = dictGetEntryVal(de);
8939 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8940 (sizeof(*o)+sdslen(ele->ptr)) :
8941 sizeof(*o);
8942 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8943 }
8944 }
8945 break;
8946 }
8947 return (double)age*log(1+asize);
8948 }
8949
8950 /* Try to swap an object that's a good candidate for swapping.
8951 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8952 * to swap any object at all.
8953 *
8954 * If 'usethreaded' is true, Redis will try to swap the object in background
8955 * using I/O threads. */
8956 static int vmSwapOneObject(int usethreads) {
8957 int j, i;
8958 struct dictEntry *best = NULL;
8959 double best_swappability = 0;
8960 redisDb *best_db = NULL;
8961 robj *key, *val;
8962
8963 for (j = 0; j < server.dbnum; j++) {
8964 redisDb *db = server.db+j;
8965 /* Why maxtries is set to 100?
8966 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8967 * are swappable objects */
8968 int maxtries = 100;
8969
8970 if (dictSize(db->dict) == 0) continue;
8971 for (i = 0; i < 5; i++) {
8972 dictEntry *de;
8973 double swappability;
8974
8975 if (maxtries) maxtries--;
8976 de = dictGetRandomKey(db->dict);
8977 key = dictGetEntryKey(de);
8978 val = dictGetEntryVal(de);
8979 /* Only swap objects that are currently in memory.
8980 *
8981 * Also don't swap shared objects if threaded VM is on, as we
8982 * try to ensure that the main thread does not touch the
8983 * object while the I/O thread is using it, but we can't
8984 * control other keys without adding additional mutex. */
8985 if (key->storage != REDIS_VM_MEMORY ||
8986 (server.vm_max_threads != 0 && val->refcount != 1)) {
8987 if (maxtries) i--; /* don't count this try */
8988 continue;
8989 }
8990 swappability = computeObjectSwappability(val);
8991 if (!best || swappability > best_swappability) {
8992 best = de;
8993 best_swappability = swappability;
8994 best_db = db;
8995 }
8996 }
8997 }
8998 if (best == NULL) return REDIS_ERR;
8999 key = dictGetEntryKey(best);
9000 val = dictGetEntryVal(best);
9001
9002 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
9003 key->ptr, best_swappability);
9004
9005 /* Unshare the key if needed */
9006 if (key->refcount > 1) {
9007 robj *newkey = dupStringObject(key);
9008 decrRefCount(key);
9009 key = dictGetEntryKey(best) = newkey;
9010 }
9011 /* Swap it */
9012 if (usethreads) {
9013 vmSwapObjectThreaded(key,val,best_db);
9014 return REDIS_OK;
9015 } else {
9016 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9017 dictGetEntryVal(best) = NULL;
9018 return REDIS_OK;
9019 } else {
9020 return REDIS_ERR;
9021 }
9022 }
9023 }
9024
/* Swap out one object synchronously: vmSwapOneObject() without threaded
 * I/O. Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9028
/* Swap out one object using the I/O threads: vmSwapOneObject() with
 * threaded I/O. Returns what vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9032
9033 /* Return true if it's safe to swap out objects in a given moment.
9034 * Basically we don't want to swap objects out while there is a BGSAVE
9035 * or a BGAEOREWRITE running in backgroud. */
9036 static int vmCanSwapOut(void) {
9037 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9038 }
9039
9040 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
9041 * and was deleted. Otherwise 0 is returned. */
9042 static int deleteIfSwapped(redisDb *db, robj *key) {
9043 dictEntry *de;
9044 robj *foundkey;
9045
9046 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9047 foundkey = dictGetEntryKey(de);
9048 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9049 deleteKey(db,key);
9050 return 1;
9051 }
9052
9053 /* =================== Virtual Memory - Threaded I/O ======================= */
9054
9055 static void freeIOJob(iojob *j) {
9056 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9057 j->type == REDIS_IOJOB_DO_SWAP ||
9058 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
9059 decrRefCount(j->val);
9060 /* We don't decrRefCount the j->key field as we did't incremented
9061 * the count creating IO Jobs. This is because the key field here is
9062 * just used as an indentifier and if a key is removed the Job should
9063 * never be touched again. */
9064 zfree(j);
9065 }
9066
9067 /* Every time a thread finished a Job, it writes a byte into the write side
9068 * of an unix pipe in order to "awake" the main thread, and this function
9069 * is called. */
9070 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9071 int mask)
9072 {
9073 char buf[1];
9074 int retval, processed = 0, toprocess = -1, trytoswap = 1;
9075 REDIS_NOTUSED(el);
9076 REDIS_NOTUSED(mask);
9077 REDIS_NOTUSED(privdata);
9078
9079 /* For every byte we read in the read side of the pipe, there is one
9080 * I/O job completed to process. */
9081 while((retval = read(fd,buf,1)) == 1) {
9082 iojob *j;
9083 listNode *ln;
9084 robj *key;
9085 struct dictEntry *de;
9086
9087 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9088
9089 /* Get the processed element (the oldest one) */
9090 lockThreadedIO();
9091 assert(listLength(server.io_processed) != 0);
9092 if (toprocess == -1) {
9093 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9094 if (toprocess <= 0) toprocess = 1;
9095 }
9096 ln = listFirst(server.io_processed);
9097 j = ln->value;
9098 listDelNode(server.io_processed,ln);
9099 unlockThreadedIO();
9100 /* If this job is marked as canceled, just ignore it */
9101 if (j->canceled) {
9102 freeIOJob(j);
9103 continue;
9104 }
9105 /* Post process it in the main thread, as there are things we
9106 * can do just here to avoid race conditions and/or invasive locks */
9107 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9108 de = dictFind(j->db->dict,j->key);
9109 assert(de != NULL);
9110 key = dictGetEntryKey(de);
9111 if (j->type == REDIS_IOJOB_LOAD) {
9112 redisDb *db;
9113
9114 /* Key loaded, bring it at home */
9115 key->storage = REDIS_VM_MEMORY;
9116 key->vm.atime = server.unixtime;
9117 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9118 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9119 (unsigned char*) key->ptr);
9120 server.vm_stats_swapped_objects--;
9121 server.vm_stats_swapins++;
9122 dictGetEntryVal(de) = j->val;
9123 incrRefCount(j->val);
9124 db = j->db;
9125 freeIOJob(j);
9126 /* Handle clients waiting for this key to be loaded. */
9127 handleClientsBlockedOnSwappedKey(db,key);
9128 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9129 /* Now we know the amount of pages required to swap this object.
9130 * Let's find some space for it, and queue this task again
9131 * rebranded as REDIS_IOJOB_DO_SWAP. */
9132 if (!vmCanSwapOut() ||
9133 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9134 {
9135 /* Ooops... no space or we can't swap as there is
9136 * a fork()ed Redis trying to save stuff on disk. */
9137 freeIOJob(j);
9138 key->storage = REDIS_VM_MEMORY; /* undo operation */
9139 } else {
9140 /* Note that we need to mark this pages as used now,
9141 * if the job will be canceled, we'll mark them as freed
9142 * again. */
9143 vmMarkPagesUsed(j->page,j->pages);
9144 j->type = REDIS_IOJOB_DO_SWAP;
9145 lockThreadedIO();
9146 queueIOJob(j);
9147 unlockThreadedIO();
9148 }
9149 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9150 robj *val;
9151
9152 /* Key swapped. We can finally free some memory. */
9153 if (key->storage != REDIS_VM_SWAPPING) {
9154 printf("key->storage: %d\n",key->storage);
9155 printf("key->name: %s\n",(char*)key->ptr);
9156 printf("key->refcount: %d\n",key->refcount);
9157 printf("val: %p\n",(void*)j->val);
9158 printf("val->type: %d\n",j->val->type);
9159 printf("val->ptr: %s\n",(char*)j->val->ptr);
9160 }
9161 redisAssert(key->storage == REDIS_VM_SWAPPING);
9162 val = dictGetEntryVal(de);
9163 key->vm.page = j->page;
9164 key->vm.usedpages = j->pages;
9165 key->storage = REDIS_VM_SWAPPED;
9166 key->vtype = j->val->type;
9167 decrRefCount(val); /* Deallocate the object from memory. */
9168 dictGetEntryVal(de) = NULL;
9169 redisLog(REDIS_DEBUG,
9170 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9171 (unsigned char*) key->ptr,
9172 (unsigned long long) j->page, (unsigned long long) j->pages);
9173 server.vm_stats_swapped_objects++;
9174 server.vm_stats_swapouts++;
9175 freeIOJob(j);
9176 /* Put a few more swap requests in queue if we are still
9177 * out of memory */
9178 if (trytoswap && vmCanSwapOut() &&
9179 zmalloc_used_memory() > server.vm_max_memory)
9180 {
9181 int more = 1;
9182 while(more) {
9183 lockThreadedIO();
9184 more = listLength(server.io_newjobs) <
9185 (unsigned) server.vm_max_threads;
9186 unlockThreadedIO();
9187 /* Don't waste CPU time if swappable objects are rare. */
9188 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9189 trytoswap = 0;
9190 break;
9191 }
9192 }
9193 }
9194 }
9195 processed++;
9196 if (processed == toprocess) return;
9197 }
9198 if (retval < 0 && errno != EAGAIN) {
9199 redisLog(REDIS_WARNING,
9200 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9201 strerror(errno));
9202 }
9203 }
9204
9205 static void lockThreadedIO(void) {
9206 pthread_mutex_lock(&server.io_mutex);
9207 }
9208
9209 static void unlockThreadedIO(void) {
9210 pthread_mutex_unlock(&server.io_mutex);
9211 }
9212
9213 /* Remove the specified object from the threaded I/O queue if still not
9214 * processed, otherwise make sure to flag it as canceled. */
9215 static void vmCancelThreadedIOJob(robj *o) {
9216 list *lists[3] = {
9217 server.io_newjobs, /* 0 */
9218 server.io_processing, /* 1 */
9219 server.io_processed /* 2 */
9220 };
9221 int i;
9222
9223 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9224 again:
9225 lockThreadedIO();
9226 /* Search for a matching key in one of the queues */
9227 for (i = 0; i < 3; i++) {
9228 listNode *ln;
9229 listIter li;
9230
9231 listRewind(lists[i],&li);
9232 while ((ln = listNext(&li)) != NULL) {
9233 iojob *job = ln->value;
9234
9235 if (job->canceled) continue; /* Skip this, already canceled. */
9236 if (job->key == o) {
9237 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9238 (void*)job, (char*)o->ptr, job->type, i);
9239 /* Mark the pages as free since the swap didn't happened
9240 * or happened but is now discarded. */
9241 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9242 vmMarkPagesFree(job->page,job->pages);
9243 /* Cancel the job. It depends on the list the job is
9244 * living in. */
9245 switch(i) {
9246 case 0: /* io_newjobs */
9247 /* If the job was yet not processed the best thing to do
9248 * is to remove it from the queue at all */
9249 freeIOJob(job);
9250 listDelNode(lists[i],ln);
9251 break;
9252 case 1: /* io_processing */
9253 /* Oh Shi- the thread is messing with the Job:
9254 *
9255 * Probably it's accessing the object if this is a
9256 * PREPARE_SWAP or DO_SWAP job.
9257 * If it's a LOAD job it may be reading from disk and
9258 * if we don't wait for the job to terminate before to
9259 * cancel it, maybe in a few microseconds data can be
9260 * corrupted in this pages. So the short story is:
9261 *
9262 * Better to wait for the job to move into the
9263 * next queue (processed)... */
9264
9265 /* We try again and again until the job is completed. */
9266 unlockThreadedIO();
9267 /* But let's wait some time for the I/O thread
9268 * to finish with this job. After all this condition
9269 * should be very rare. */
9270 usleep(1);
9271 goto again;
9272 case 2: /* io_processed */
9273 /* The job was already processed, that's easy...
9274 * just mark it as canceled so that we'll ignore it
9275 * when processing completed jobs. */
9276 job->canceled = 1;
9277 break;
9278 }
9279 /* Finally we have to adjust the storage type of the object
9280 * in order to "UNDO" the operaiton. */
9281 if (o->storage == REDIS_VM_LOADING)
9282 o->storage = REDIS_VM_SWAPPED;
9283 else if (o->storage == REDIS_VM_SWAPPING)
9284 o->storage = REDIS_VM_MEMORY;
9285 unlockThreadedIO();
9286 return;
9287 }
9288 }
9289 }
9290 unlockThreadedIO();
9291 assert(1 != 1); /* We should never reach this */
9292 }
9293
9294 static void *IOThreadEntryPoint(void *arg) {
9295 iojob *j;
9296 listNode *ln;
9297 REDIS_NOTUSED(arg);
9298
9299 pthread_detach(pthread_self());
9300 while(1) {
9301 /* Get a new job to process */
9302 lockThreadedIO();
9303 if (listLength(server.io_newjobs) == 0) {
9304 /* No new jobs in queue, exit. */
9305 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9306 (long) pthread_self());
9307 server.io_active_threads--;
9308 unlockThreadedIO();
9309 return NULL;
9310 }
9311 ln = listFirst(server.io_newjobs);
9312 j = ln->value;
9313 listDelNode(server.io_newjobs,ln);
9314 /* Add the job in the processing queue */
9315 j->thread = pthread_self();
9316 listAddNodeTail(server.io_processing,j);
9317 ln = listLast(server.io_processing); /* We use ln later to remove it */
9318 unlockThreadedIO();
9319 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9320 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9321
9322 /* Process the Job */
9323 if (j->type == REDIS_IOJOB_LOAD) {
9324 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9325 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9326 FILE *fp = fopen("/dev/null","w+");
9327 j->pages = rdbSavedObjectPages(j->val,fp);
9328 fclose(fp);
9329 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9330 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9331 j->canceled = 1;
9332 }
9333
9334 /* Done: insert the job into the processed queue */
9335 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9336 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9337 lockThreadedIO();
9338 listDelNode(server.io_processing,ln);
9339 listAddNodeTail(server.io_processed,j);
9340 unlockThreadedIO();
9341
9342 /* Signal the main thread there is new stuff to process */
9343 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9344 }
9345 return NULL; /* never reached */
9346 }
9347
9348 static void spawnIOThread(void) {
9349 pthread_t thread;
9350 sigset_t mask, omask;
9351 int err;
9352
9353 sigemptyset(&mask);
9354 sigaddset(&mask,SIGCHLD);
9355 sigaddset(&mask,SIGHUP);
9356 sigaddset(&mask,SIGPIPE);
9357 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9358 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9359 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9360 strerror(err));
9361 usleep(1000000);
9362 }
9363 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9364 server.io_active_threads++;
9365 }
9366
9367 /* We need to wait for the last thread to exit before we are able to
9368 * fork() in order to BGSAVE or BGREWRITEAOF. */
9369 static void waitEmptyIOJobsQueue(void) {
9370 while(1) {
9371 int io_processed_len;
9372
9373 lockThreadedIO();
9374 if (listLength(server.io_newjobs) == 0 &&
9375 listLength(server.io_processing) == 0 &&
9376 server.io_active_threads == 0)
9377 {
9378 unlockThreadedIO();
9379 return;
9380 }
9381 /* While waiting for empty jobs queue condition we post-process some
9382 * finshed job, as I/O threads may be hanging trying to write against
9383 * the io_ready_pipe_write FD but there are so much pending jobs that
9384 * it's blocking. */
9385 io_processed_len = listLength(server.io_processed);
9386 unlockThreadedIO();
9387 if (io_processed_len) {
9388 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9389 usleep(1000); /* 1 millisecond */
9390 } else {
9391 usleep(10000); /* 10 milliseconds */
9392 }
9393 }
9394 }
9395
9396 static void vmReopenSwapFile(void) {
9397 /* Note: we don't close the old one as we are in the child process
9398 * and don't want to mess at all with the original file object. */
9399 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9400 if (server.vm_fp == NULL) {
9401 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9402 server.vm_swap_file);
9403 _exit(1);
9404 }
9405 server.vm_fd = fileno(server.vm_fp);
9406 }
9407
9408 /* This function must be called while with threaded IO locked */
9409 static void queueIOJob(iojob *j) {
9410 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9411 (void*)j, j->type, (char*)j->key->ptr);
9412 listAddNodeTail(server.io_newjobs,j);
9413 if (server.io_active_threads < server.vm_max_threads)
9414 spawnIOThread();
9415 }
9416
9417 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9418 iojob *j;
9419
9420 assert(key->storage == REDIS_VM_MEMORY);
9421 assert(key->refcount == 1);
9422
9423 j = zmalloc(sizeof(*j));
9424 j->type = REDIS_IOJOB_PREPARE_SWAP;
9425 j->db = db;
9426 j->key = key;
9427 j->val = val;
9428 incrRefCount(val);
9429 j->canceled = 0;
9430 j->thread = (pthread_t) -1;
9431 key->storage = REDIS_VM_SWAPPING;
9432
9433 lockThreadedIO();
9434 queueIOJob(j);
9435 unlockThreadedIO();
9436 return REDIS_OK;
9437 }
9438
9439 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9440
9441 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9442 * If there is not already a job loading the key, it is craeted.
9443 * The key is added to the io_keys list in the client structure, and also
9444 * in the hash table mapping swapped keys to waiting clients, that is,
9445 * server.io_waited_keys. */
9446 static int waitForSwappedKey(redisClient *c, robj *key) {
9447 struct dictEntry *de;
9448 robj *o;
9449 list *l;
9450
9451 /* If the key does not exist or is already in RAM we don't need to
9452 * block the client at all. */
9453 de = dictFind(c->db->dict,key);
9454 if (de == NULL) return 0;
9455 o = dictGetEntryKey(de);
9456 if (o->storage == REDIS_VM_MEMORY) {
9457 return 0;
9458 } else if (o->storage == REDIS_VM_SWAPPING) {
9459 /* We were swapping the key, undo it! */
9460 vmCancelThreadedIOJob(o);
9461 return 0;
9462 }
9463
9464 /* OK: the key is either swapped, or being loaded just now. */
9465
9466 /* Add the key to the list of keys this client is waiting for.
9467 * This maps clients to keys they are waiting for. */
9468 listAddNodeTail(c->io_keys,key);
9469 incrRefCount(key);
9470
9471 /* Add the client to the swapped keys => clients waiting map. */
9472 de = dictFind(c->db->io_keys,key);
9473 if (de == NULL) {
9474 int retval;
9475
9476 /* For every key we take a list of clients blocked for it */
9477 l = listCreate();
9478 retval = dictAdd(c->db->io_keys,key,l);
9479 incrRefCount(key);
9480 assert(retval == DICT_OK);
9481 } else {
9482 l = dictGetEntryVal(de);
9483 }
9484 listAddNodeTail(l,c);
9485
9486 /* Are we already loading the key from disk? If not create a job */
9487 if (o->storage == REDIS_VM_SWAPPED) {
9488 iojob *j;
9489
9490 o->storage = REDIS_VM_LOADING;
9491 j = zmalloc(sizeof(*j));
9492 j->type = REDIS_IOJOB_LOAD;
9493 j->db = c->db;
9494 j->key = o;
9495 j->key->vtype = o->vtype;
9496 j->page = o->vm.page;
9497 j->val = NULL;
9498 j->canceled = 0;
9499 j->thread = (pthread_t) -1;
9500 lockThreadedIO();
9501 queueIOJob(j);
9502 unlockThreadedIO();
9503 }
9504 return 1;
9505 }
9506
9507 /* Preload keys needed for the ZUNION and ZINTER commands. */
9508 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9509 int i, num;
9510 num = atoi(c->argv[2]->ptr);
9511 for (i = 0; i < num; i++) {
9512 waitForSwappedKey(c,c->argv[3+i]);
9513 }
9514 }
9515
9516 /* Is this client attempting to run a command against swapped keys?
9517 * If so, block it ASAP, load the keys in background, then resume it.
9518 *
9519 * The important idea about this function is that it can fail! If keys will
9520 * still be swapped when the client is resumed, this key lookups will
9521 * just block loading keys from disk. In practical terms this should only
9522 * happen with SORT BY command or if there is a bug in this function.
9523 *
9524 * Return 1 if the client is marked as blocked, 0 if the client can
9525 * continue as the keys it is going to access appear to be in memory. */
9526 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9527 int j, last;
9528
9529 if (cmd->vm_preload_proc != NULL) {
9530 cmd->vm_preload_proc(c);
9531 } else {
9532 if (cmd->vm_firstkey == 0) return 0;
9533 last = cmd->vm_lastkey;
9534 if (last < 0) last = c->argc+last;
9535 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9536 waitForSwappedKey(c,c->argv[j]);
9537 }
9538
9539 /* If the client was blocked for at least one key, mark it as blocked. */
9540 if (listLength(c->io_keys)) {
9541 c->flags |= REDIS_IO_WAIT;
9542 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9543 server.vm_blocked_clients++;
9544 return 1;
9545 } else {
9546 return 0;
9547 }
9548 }
9549
9550 /* Remove the 'key' from the list of blocked keys for a given client.
9551 *
9552 * The function returns 1 when there are no longer blocking keys after
9553 * the current one was removed (and the client can be unblocked). */
9554 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9555 list *l;
9556 listNode *ln;
9557 listIter li;
9558 struct dictEntry *de;
9559
9560 /* Remove the key from the list of keys this client is waiting for. */
9561 listRewind(c->io_keys,&li);
9562 while ((ln = listNext(&li)) != NULL) {
9563 if (compareStringObjects(ln->value,key) == 0) {
9564 listDelNode(c->io_keys,ln);
9565 break;
9566 }
9567 }
9568 assert(ln != NULL);
9569
9570 /* Remove the client form the key => waiting clients map. */
9571 de = dictFind(c->db->io_keys,key);
9572 assert(de != NULL);
9573 l = dictGetEntryVal(de);
9574 ln = listSearchKey(l,c);
9575 assert(ln != NULL);
9576 listDelNode(l,ln);
9577 if (listLength(l) == 0)
9578 dictDelete(c->db->io_keys,key);
9579
9580 return listLength(c->io_keys) == 0;
9581 }
9582
9583 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9584 struct dictEntry *de;
9585 list *l;
9586 listNode *ln;
9587 int len;
9588
9589 de = dictFind(db->io_keys,key);
9590 if (!de) return;
9591
9592 l = dictGetEntryVal(de);
9593 len = listLength(l);
9594 /* Note: we can't use something like while(listLength(l)) as the list
9595 * can be freed by the calling function when we remove the last element. */
9596 while (len--) {
9597 ln = listFirst(l);
9598 redisClient *c = ln->value;
9599
9600 if (dontWaitForSwappedKey(c,key)) {
9601 /* Put the client in the list of clients ready to go as we
9602 * loaded all the keys about it. */
9603 listAddNodeTail(server.io_ready_clients,c);
9604 }
9605 }
9606 }
9607
9608 /* =========================== Remote Configuration ========================= */
9609
9610 static void configSetCommand(redisClient *c) {
9611 robj *o = getDecodedObject(c->argv[3]);
9612 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9613 zfree(server.dbfilename);
9614 server.dbfilename = zstrdup(o->ptr);
9615 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9616 zfree(server.requirepass);
9617 server.requirepass = zstrdup(o->ptr);
9618 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9619 zfree(server.masterauth);
9620 server.masterauth = zstrdup(o->ptr);
9621 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9622 server.maxmemory = strtoll(o->ptr, NULL, 10);
9623 } else {
9624 addReplySds(c,sdscatprintf(sdsempty(),
9625 "-ERR not supported CONFIG parameter %s\r\n",
9626 (char*)c->argv[2]->ptr));
9627 decrRefCount(o);
9628 return;
9629 }
9630 decrRefCount(o);
9631 addReply(c,shared.ok);
9632 }
9633
9634 static void configGetCommand(redisClient *c) {
9635 robj *o = getDecodedObject(c->argv[2]);
9636 robj *lenobj = createObject(REDIS_STRING,NULL);
9637 char *pattern = o->ptr;
9638 int matches = 0;
9639
9640 addReply(c,lenobj);
9641 decrRefCount(lenobj);
9642
9643 if (stringmatch(pattern,"dbfilename",0)) {
9644 addReplyBulkCString(c,"dbfilename");
9645 addReplyBulkCString(c,server.dbfilename);
9646 matches++;
9647 }
9648 if (stringmatch(pattern,"requirepass",0)) {
9649 addReplyBulkCString(c,"requirepass");
9650 addReplyBulkCString(c,server.requirepass);
9651 matches++;
9652 }
9653 if (stringmatch(pattern,"masterauth",0)) {
9654 addReplyBulkCString(c,"masterauth");
9655 addReplyBulkCString(c,server.masterauth);
9656 matches++;
9657 }
9658 if (stringmatch(pattern,"maxmemory",0)) {
9659 char buf[128];
9660
9661 snprintf(buf,128,"%llu\n",server.maxmemory);
9662 addReplyBulkCString(c,"maxmemory");
9663 addReplyBulkCString(c,buf);
9664 matches++;
9665 }
9666 decrRefCount(o);
9667 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9668 }
9669
9670 static void configCommand(redisClient *c) {
9671 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9672 if (c->argc != 4) goto badarity;
9673 configSetCommand(c);
9674 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9675 if (c->argc != 3) goto badarity;
9676 configGetCommand(c);
9677 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9678 if (c->argc != 2) goto badarity;
9679 server.stat_numcommands = 0;
9680 server.stat_numconnections = 0;
9681 server.stat_expiredkeys = 0;
9682 server.stat_starttime = time(NULL);
9683 addReply(c,shared.ok);
9684 } else {
9685 addReplySds(c,sdscatprintf(sdsempty(),
9686 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9687 }
9688 return;
9689
9690 badarity:
9691 addReplySds(c,sdscatprintf(sdsempty(),
9692 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9693 (char*) c->argv[1]->ptr));
9694 }
9695
9696 /* =========================== Pubsub implementation ======================== */
9697
9698 static void freePubsubPattern(void *p) {
9699 pubsubPattern *pat = p;
9700
9701 decrRefCount(pat->pattern);
9702 zfree(pat);
9703 }
9704
9705 static int listMatchPubsubPattern(void *a, void *b) {
9706 pubsubPattern *pa = a, *pb = b;
9707
9708 return (pa->client == pb->client) &&
9709 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9710 }
9711
9712 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9713 * 0 if the client was already subscribed to that channel. */
9714 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9715 struct dictEntry *de;
9716 list *clients = NULL;
9717 int retval = 0;
9718
9719 /* Add the channel to the client -> channels hash table */
9720 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9721 retval = 1;
9722 incrRefCount(channel);
9723 /* Add the client to the channel -> list of clients hash table */
9724 de = dictFind(server.pubsub_channels,channel);
9725 if (de == NULL) {
9726 clients = listCreate();
9727 dictAdd(server.pubsub_channels,channel,clients);
9728 incrRefCount(channel);
9729 } else {
9730 clients = dictGetEntryVal(de);
9731 }
9732 listAddNodeTail(clients,c);
9733 }
9734 /* Notify the client */
9735 addReply(c,shared.mbulk3);
9736 addReply(c,shared.subscribebulk);
9737 addReplyBulk(c,channel);
9738 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9739 return retval;
9740 }
9741
9742 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9743 * 0 if the client was not subscribed to the specified channel. */
9744 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9745 struct dictEntry *de;
9746 list *clients;
9747 listNode *ln;
9748 int retval = 0;
9749
9750 /* Remove the channel from the client -> channels hash table */
9751 incrRefCount(channel); /* channel may be just a pointer to the same object
9752 we have in the hash tables. Protect it... */
9753 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9754 retval = 1;
9755 /* Remove the client from the channel -> clients list hash table */
9756 de = dictFind(server.pubsub_channels,channel);
9757 assert(de != NULL);
9758 clients = dictGetEntryVal(de);
9759 ln = listSearchKey(clients,c);
9760 assert(ln != NULL);
9761 listDelNode(clients,ln);
9762 if (listLength(clients) == 0) {
9763 /* Free the list and associated hash entry at all if this was
9764 * the latest client, so that it will be possible to abuse
9765 * Redis PUBSUB creating millions of channels. */
9766 dictDelete(server.pubsub_channels,channel);
9767 }
9768 }
9769 /* Notify the client */
9770 if (notify) {
9771 addReply(c,shared.mbulk3);
9772 addReply(c,shared.unsubscribebulk);
9773 addReplyBulk(c,channel);
9774 addReplyLong(c,dictSize(c->pubsub_channels)+
9775 listLength(c->pubsub_patterns));
9776
9777 }
9778 decrRefCount(channel); /* it is finally safe to release it */
9779 return retval;
9780 }
9781
9782 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9783 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9784 int retval = 0;
9785
9786 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9787 retval = 1;
9788 pubsubPattern *pat;
9789 listAddNodeTail(c->pubsub_patterns,pattern);
9790 incrRefCount(pattern);
9791 pat = zmalloc(sizeof(*pat));
9792 pat->pattern = getDecodedObject(pattern);
9793 pat->client = c;
9794 listAddNodeTail(server.pubsub_patterns,pat);
9795 }
9796 /* Notify the client */
9797 addReply(c,shared.mbulk3);
9798 addReply(c,shared.psubscribebulk);
9799 addReplyBulk(c,pattern);
9800 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9801 return retval;
9802 }
9803
9804 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9805 * 0 if the client was not subscribed to the specified channel. */
9806 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9807 listNode *ln;
9808 pubsubPattern pat;
9809 int retval = 0;
9810
9811 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9812 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9813 retval = 1;
9814 listDelNode(c->pubsub_patterns,ln);
9815 pat.client = c;
9816 pat.pattern = pattern;
9817 ln = listSearchKey(server.pubsub_patterns,&pat);
9818 listDelNode(server.pubsub_patterns,ln);
9819 }
9820 /* Notify the client */
9821 if (notify) {
9822 addReply(c,shared.mbulk3);
9823 addReply(c,shared.punsubscribebulk);
9824 addReplyBulk(c,pattern);
9825 addReplyLong(c,dictSize(c->pubsub_channels)+
9826 listLength(c->pubsub_patterns));
9827 }
9828 decrRefCount(pattern);
9829 return retval;
9830 }
9831
9832 /* Unsubscribe from all the channels. Return the number of channels the
9833 * client was subscribed from. */
9834 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9835 dictIterator *di = dictGetIterator(c->pubsub_channels);
9836 dictEntry *de;
9837 int count = 0;
9838
9839 while((de = dictNext(di)) != NULL) {
9840 robj *channel = dictGetEntryKey(de);
9841
9842 count += pubsubUnsubscribeChannel(c,channel,notify);
9843 }
9844 dictReleaseIterator(di);
9845 return count;
9846 }
9847
9848 /* Unsubscribe from all the patterns. Return the number of patterns the
9849 * client was subscribed from. */
9850 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9851 listNode *ln;
9852 listIter li;
9853 int count = 0;
9854
9855 listRewind(c->pubsub_patterns,&li);
9856 while ((ln = listNext(&li)) != NULL) {
9857 robj *pattern = ln->value;
9858
9859 count += pubsubUnsubscribePattern(c,pattern,notify);
9860 }
9861 return count;
9862 }
9863
9864 /* Publish a message */
9865 static int pubsubPublishMessage(robj *channel, robj *message) {
9866 int receivers = 0;
9867 struct dictEntry *de;
9868 listNode *ln;
9869 listIter li;
9870
9871 /* Send to clients listening for that channel */
9872 de = dictFind(server.pubsub_channels,channel);
9873 if (de) {
9874 list *list = dictGetEntryVal(de);
9875 listNode *ln;
9876 listIter li;
9877
9878 listRewind(list,&li);
9879 while ((ln = listNext(&li)) != NULL) {
9880 redisClient *c = ln->value;
9881
9882 addReply(c,shared.mbulk3);
9883 addReply(c,shared.messagebulk);
9884 addReplyBulk(c,channel);
9885 addReplyBulk(c,message);
9886 receivers++;
9887 }
9888 }
9889 /* Send to clients listening to matching channels */
9890 if (listLength(server.pubsub_patterns)) {
9891 listRewind(server.pubsub_patterns,&li);
9892 channel = getDecodedObject(channel);
9893 while ((ln = listNext(&li)) != NULL) {
9894 pubsubPattern *pat = ln->value;
9895
9896 if (stringmatchlen((char*)pat->pattern->ptr,
9897 sdslen(pat->pattern->ptr),
9898 (char*)channel->ptr,
9899 sdslen(channel->ptr),0)) {
9900 addReply(pat->client,shared.mbulk4);
9901 addReply(pat->client,shared.pmessagebulk);
9902 addReplyBulk(pat->client,pat->pattern);
9903 addReplyBulk(pat->client,channel);
9904 addReplyBulk(pat->client,message);
9905 receivers++;
9906 }
9907 }
9908 decrRefCount(channel);
9909 }
9910 return receivers;
9911 }
9912
9913 static void subscribeCommand(redisClient *c) {
9914 int j;
9915
9916 for (j = 1; j < c->argc; j++)
9917 pubsubSubscribeChannel(c,c->argv[j]);
9918 }
9919
9920 static void unsubscribeCommand(redisClient *c) {
9921 if (c->argc == 1) {
9922 pubsubUnsubscribeAllChannels(c,1);
9923 return;
9924 } else {
9925 int j;
9926
9927 for (j = 1; j < c->argc; j++)
9928 pubsubUnsubscribeChannel(c,c->argv[j],1);
9929 }
9930 }
9931
9932 static void psubscribeCommand(redisClient *c) {
9933 int j;
9934
9935 for (j = 1; j < c->argc; j++)
9936 pubsubSubscribePattern(c,c->argv[j]);
9937 }
9938
9939 static void punsubscribeCommand(redisClient *c) {
9940 if (c->argc == 1) {
9941 pubsubUnsubscribeAllPatterns(c,1);
9942 return;
9943 } else {
9944 int j;
9945
9946 for (j = 1; j < c->argc; j++)
9947 pubsubUnsubscribePattern(c,c->argv[j],1);
9948 }
9949 }
9950
9951 static void publishCommand(redisClient *c) {
9952 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9953 addReplyLong(c,receivers);
9954 }
9955
9956 /* ================================= Debugging ============================== */
9957
9958 static void debugCommand(redisClient *c) {
9959 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9960 *((char*)-1) = 'x';
9961 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9962 if (rdbSave(server.dbfilename) != REDIS_OK) {
9963 addReply(c,shared.err);
9964 return;
9965 }
9966 emptyDb();
9967 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9968 addReply(c,shared.err);
9969 return;
9970 }
9971 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9972 addReply(c,shared.ok);
9973 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9974 emptyDb();
9975 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9976 addReply(c,shared.err);
9977 return;
9978 }
9979 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9980 addReply(c,shared.ok);
9981 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9982 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9983 robj *key, *val;
9984
9985 if (!de) {
9986 addReply(c,shared.nokeyerr);
9987 return;
9988 }
9989 key = dictGetEntryKey(de);
9990 val = dictGetEntryVal(de);
9991 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9992 key->storage == REDIS_VM_SWAPPING)) {
9993 char *strenc;
9994 char buf[128];
9995
9996 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9997 strenc = strencoding[val->encoding];
9998 } else {
9999 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10000 strenc = buf;
10001 }
10002 addReplySds(c,sdscatprintf(sdsempty(),
10003 "+Key at:%p refcount:%d, value at:%p refcount:%d "
10004 "encoding:%s serializedlength:%lld\r\n",
10005 (void*)key, key->refcount, (void*)val, val->refcount,
10006 strenc, (long long) rdbSavedObjectLen(val,NULL)));
10007 } else {
10008 addReplySds(c,sdscatprintf(sdsempty(),
10009 "+Key at:%p refcount:%d, value swapped at: page %llu "
10010 "using %llu pages\r\n",
10011 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10012 (unsigned long long) key->vm.usedpages));
10013 }
10014 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10015 lookupKeyRead(c->db,c->argv[2]);
10016 addReply(c,shared.ok);
10017 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10018 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10019 robj *key, *val;
10020
10021 if (!server.vm_enabled) {
10022 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10023 return;
10024 }
10025 if (!de) {
10026 addReply(c,shared.nokeyerr);
10027 return;
10028 }
10029 key = dictGetEntryKey(de);
10030 val = dictGetEntryVal(de);
10031 /* If the key is shared we want to create a copy */
10032 if (key->refcount > 1) {
10033 robj *newkey = dupStringObject(key);
10034 decrRefCount(key);
10035 key = dictGetEntryKey(de) = newkey;
10036 }
10037 /* Swap it */
10038 if (key->storage != REDIS_VM_MEMORY) {
10039 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
10040 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
10041 dictGetEntryVal(de) = NULL;
10042 addReply(c,shared.ok);
10043 } else {
10044 addReply(c,shared.err);
10045 }
10046 } else {
10047 addReplySds(c,sdsnew(
10048 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
10049 }
10050 }
10051
10052 static void _redisAssert(char *estr, char *file, int line) {
10053 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
10054 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
10055 #ifdef HAVE_BACKTRACE
10056 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10057 *((char*)-1) = 'x';
10058 #endif
10059 }
10060
10061 static void _redisPanic(char *msg, char *file, int line) {
10062 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
10063 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
10064 #ifdef HAVE_BACKTRACE
10065 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10066 *((char*)-1) = 'x';
10067 #endif
10068 }
10069
10070 /* =================================== Main! ================================ */
10071
10072 #ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 * Returns the integer found in the file, or -1 if the file can't be
 * opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *proc = fopen("/proc/sys/vm/overcommit_memory","r");

    if (proc == NULL) return -1;
    if (fgets(line,64,proc) != NULL)
        value = atoi(line);
    fclose(proc);
    return value;
}
10086
10087 void linuxOvercommitMemoryWarning(void) {
10088 if (linuxOvercommitMemoryValue() == 0) {
10089 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10090 }
10091 }
10092 #endif /* __linux__ */
10093
10094 static void daemonize(void) {
10095 int fd;
10096 FILE *fp;
10097
10098 if (fork() != 0) exit(0); /* parent exits */
10099 setsid(); /* create a new session */
10100
10101 /* Every output goes to /dev/null. If Redis is daemonized but
10102 * the 'logfile' is set to 'stdout' in the configuration file
10103 * it will not log at all. */
10104 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10105 dup2(fd, STDIN_FILENO);
10106 dup2(fd, STDOUT_FILENO);
10107 dup2(fd, STDERR_FILENO);
10108 if (fd > STDERR_FILENO) close(fd);
10109 }
10110 /* Try to write the pid file */
10111 fp = fopen(server.pidfile,"w");
10112 if (fp) {
10113 fprintf(fp,"%d\n",getpid());
10114 fclose(fp);
10115 }
10116 }
10117
10118 static void version() {
10119 printf("Redis server version %s\n", REDIS_VERSION);
10120 exit(0);
10121 }
10122
/* Print the command line synopsis on standard error and exit with status 1.
 * Declared with an explicit (void) parameter list so that the definition
 * also acts as a prototype. */
static void usage(void) {
    fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(stderr,"       ./redis-server - (read config from stdin)\n");
    exit(1);
}
10128
10129 int main(int argc, char **argv) {
10130 time_t start;
10131
10132 initServerConfig();
10133 if (argc == 2) {
10134 if (strcmp(argv[1], "-v") == 0 ||
10135 strcmp(argv[1], "--version") == 0) version();
10136 if (strcmp(argv[1], "--help") == 0) usage();
10137 resetServerSaveParams();
10138 loadServerConfig(argv[1]);
10139 } else if ((argc > 2)) {
10140 usage();
10141 } else {
10142 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10143 }
10144 if (server.daemonize) daemonize();
10145 initServer();
10146 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10147 #ifdef __linux__
10148 linuxOvercommitMemoryWarning();
10149 #endif
10150 start = time(NULL);
10151 if (server.appendonly) {
10152 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10153 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10154 } else {
10155 if (rdbLoad(server.dbfilename) == REDIS_OK)
10156 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10157 }
10158 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10159 aeSetBeforeSleepProc(server.el,beforeSleep);
10160 aeMain(server.el);
10161 aeDeleteEventLoop(server.el);
10162 return 0;
10163 }
10164
10165 /* ============================= Backtrace support ========================= */
10166
10167 #ifdef HAVE_BACKTRACE
10168 static char *findFuncName(void *pointer, unsigned long *offset);
10169
10170 static void *getMcontextEip(ucontext_t *uc) {
10171 #if defined(__FreeBSD__)
10172 return (void*) uc->uc_mcontext.mc_eip;
10173 #elif defined(__dietlibc__)
10174 return (void*) uc->uc_mcontext.eip;
10175 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10176 #if __x86_64__
10177 return (void*) uc->uc_mcontext->__ss.__rip;
10178 #else
10179 return (void*) uc->uc_mcontext->__ss.__eip;
10180 #endif
10181 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10182 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10183 return (void*) uc->uc_mcontext->__ss.__rip;
10184 #else
10185 return (void*) uc->uc_mcontext->__ss.__eip;
10186 #endif
10187 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10188 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10189 #elif defined(__ia64__) /* Linux IA64 */
10190 return (void*) uc->uc_mcontext.sc_ip;
10191 #else
10192 return NULL;
10193 #endif
10194 }
10195
10196 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10197 void *trace[100];
10198 char **messages = NULL;
10199 int i, trace_size = 0;
10200 unsigned long offset=0;
10201 ucontext_t *uc = (ucontext_t*) secret;
10202 sds infostring;
10203 REDIS_NOTUSED(info);
10204
10205 redisLog(REDIS_WARNING,
10206 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10207 infostring = genRedisInfoString();
10208 redisLog(REDIS_WARNING, "%s",infostring);
10209 /* It's not safe to sdsfree() the returned string under memory
10210 * corruption conditions. Let it leak as we are going to abort */
10211
10212 trace_size = backtrace(trace, 100);
10213 /* overwrite sigaction with caller's address */
10214 if (getMcontextEip(uc) != NULL) {
10215 trace[1] = getMcontextEip(uc);
10216 }
10217 messages = backtrace_symbols(trace, trace_size);
10218
10219 for (i=1; i<trace_size; ++i) {
10220 char *fn = findFuncName(trace[i], &offset), *p;
10221
10222 p = strchr(messages[i],'+');
10223 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10224 redisLog(REDIS_WARNING,"%s", messages[i]);
10225 } else {
10226 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10227 }
10228 }
10229 /* free(messages); Don't call free() with possibly corrupted memory. */
10230 _exit(0);
10231 }
10232
10233 static void setupSigSegvAction(void) {
10234 struct sigaction act;
10235
10236 sigemptyset (&act.sa_mask);
10237 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10238 * is used. Otherwise, sa_handler is used */
10239 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10240 act.sa_sigaction = segvHandler;
10241 sigaction (SIGSEGV, &act, NULL);
10242 sigaction (SIGBUS, &act, NULL);
10243 sigaction (SIGFPE, &act, NULL);
10244 sigaction (SIGILL, &act, NULL);
10245 sigaction (SIGBUS, &act, NULL);
10246 return;
10247 }
10248
10249 #include "staticsymbols.h"
10250 /* This function try to convert a pointer into a function name. It's used in
10251 * oreder to provide a backtrace under segmentation fault that's able to
10252 * display functions declared as static (otherwise the backtrace is useless). */
10253 static char *findFuncName(void *pointer, unsigned long *offset){
10254 int i, ret = -1;
10255 unsigned long off, minoff = 0;
10256
10257 /* Try to match against the Symbol with the smallest offset */
10258 for (i=0; symsTable[i].pointer; i++) {
10259 unsigned long lp = (unsigned long) pointer;
10260
10261 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10262 off=lp-symsTable[i].pointer;
10263 if (ret < 0 || off < minoff) {
10264 minoff=off;
10265 ret=i;
10266 }
10267 }
10268 }
10269 if (ret == -1) return NULL;
10270 *offset = minoff;
10271 return symsTable[ret].name;
10272 }
10273 #else /* HAVE_BACKTRACE */
/* Without backtrace support there is no crash handler to install. */
static void setupSigSegvAction(void) { }
10276 #endif /* HAVE_BACKTRACE */
10277
10278
10279
10280 /* The End */
10281
10282
10283