]> git.saurik.com Git - redis.git/blob - redis.c
typo fixed, reloaded
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source file. */
#define REDIS_VERSION "1.3.10"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
/* Error codes */
#define REDIS_OK 0
/* The negative literal is parenthesized so that the macro always expands to
 * a single operand, whatever expression surrounds it. */
#define REDIS_ERR (-1)
83
/* ---- Static server configuration ---- */

/* TCP port. */
#define REDIS_SERVERPORT 6379
/* Default client timeout. */
#define REDIS_MAXIDLETIME (60*5)
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
/* Max number of objects to cache. */
#define REDIS_OBJFREELIST_MAX 1000000
/* A slave can't take more than this to sync. */
#define REDIS_MAX_SYNC_TIME 60
/* Look up 10 expires per loop. */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
/* Max bytes in an inline command. */
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256)

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev. */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call. */
#define REDIS_WRITEV_IOVEC_COUNT 256
102
/* ---- Hash table parameters ---- */

/* Minimal hash table fill: 10%. */
#define REDIS_HT_MINFILL 10

/* ---- Command flags ---- */

/* Bulk write command. */
#define REDIS_CMD_BULK 1
/* Inline command. */
#define REDIS_CMD_INLINE 2
/* Commands marked with REDIS_CMD_DENYOOM return an error when the
 * 'maxmemory' option is set in the config file and the server is using more
 * than maxmemory bytes of memory. In short, these commands are denied on low
 * memory conditions. */
#define REDIS_CMD_DENYOOM 4
/* Force replication even if dirty is 0. */
#define REDIS_CMD_FORCE_REPLICATION 8
115
/* ---- Object types ---- */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4
122
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of the following values. */

/* Raw representation. */
#define REDIS_ENCODING_RAW 0
/* Encoded as integer. */
#define REDIS_ENCODING_INT 1
/* Encoded as zipmap. */
#define REDIS_ENCODING_ZIPMAP 2
/* Encoded as a hash table. */
#define REDIS_ENCODING_HT 3

/* Encoding names, indexed by the REDIS_ENCODING_* value. */
static char *strencoding[] = {
    [REDIS_ENCODING_RAW]    = "raw",
    [REDIS_ENCODING_INT]    = "int",
    [REDIS_ENCODING_ZIPMAP] = "zipmap",
    [REDIS_ENCODING_HT]     = "hashtable"
};
134
/* ---- Object types only used for dumping to disk ---- */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB   254
#define REDIS_EOF        255

/* ---- Dump file length encoding ----
 *
 * Storing 32 bit lengths for short keys requires a lot of space, so the most
 * significant 2 bits of the first byte are checked to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN  0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL   3
#define REDIS_RDB_LENERR   UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */

/* 8 bit signed integer. */
#define REDIS_RDB_ENC_INT8 0
/* 16 bit signed integer. */
#define REDIS_RDB_ENC_INT16 1
/* 32 bit signed integer. */
#define REDIS_RDB_ENC_INT32 2
/* String compressed with FASTLZ. */
#define REDIS_RDB_ENC_LZF 3
166
/* ---- Virtual memory: where an object currently lives ---- */

/* The object is in memory. */
#define REDIS_VM_MEMORY 0
/* The object is on disk. */
#define REDIS_VM_SWAPPED 1
/* Redis is swapping this object to disk. */
#define REDIS_VM_SWAPPING 2
/* Redis is loading this object from disk. */
#define REDIS_VM_LOADING 3

/* ---- Virtual memory static configuration ----
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES   65536
#define REDIS_VM_MAX_RANDOM_JUMP  4096
#define REDIS_VM_MAX_THREADS      32
#define REDIS_THREAD_STACK_SIZE   (1024*1024*4)

/* The *percentage* of completed I/O jobs to process when the handler is
 * called. While Virtual Memory I/O operations are performed by threads,
 * these operations must be processed by the main thread when completed in
 * order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
/* ---- Client flags ---- */

/* This client is a slave server. */
#define REDIS_SLAVE 1
/* This client is a master server. */
#define REDIS_MASTER 2
/* This client is a slave monitor, see MONITOR. */
#define REDIS_MONITOR 4
/* This client is in a MULTI context. */
#define REDIS_MULTI 8
/* The client is waiting in a blocking operation. */
#define REDIS_BLOCKED 16
/* The client is waiting for Virtual Memory I/O. */
#define REDIS_IO_WAIT 32

/* ---- Slave replication state, slave side ---- */

/* No active replication. */
#define REDIS_REPL_NONE 0
/* Must connect to master. */
#define REDIS_REPL_CONNECT 1
/* Connected to master. */
#define REDIS_REPL_CONNECTED 2

/* ---- Slave replication state, from the point of view of the master ----
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state, instead, the server
 * is waiting to start the next background saving in order to send updates
 * to it. */

/* Master waits bgsave to start feeding it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3
/* Master waits bgsave to start bulk DB transmission. */
#define REDIS_REPL_WAIT_BGSAVE_END 4
/* Master is sending the bulk DB. */
#define REDIS_REPL_SEND_BULK 5
/* Bulk DB already transmitted, receive updates. */
#define REDIS_REPL_ONLINE 6

/* ---- List related stuff ---- */
#define REDIS_HEAD 0
#define REDIS_TAIL 1
210
/* ---- Sort operations ---- */
#define REDIS_SORT_GET      0
#define REDIS_SORT_ASC      1
#define REDIS_SORT_DESC     2
#define REDIS_SORTKEY_MAX   1024

/* ---- Log levels ---- */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
222
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused parameter or variable does not trigger a compiler warning.
 * The argument is parenthesized so the cast applies to the whole expression
 * (e.g. REDIS_NOTUSED(a + b)), not just to its first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
225
/* ---- Skiplist parameters ---- */

/* Should be enough for 2^32 elements. */
#define ZSKIPLIST_MAXLEVEL 32
/* Skiplist P = 1/4. */
#define ZSKIPLIST_P 0.25

/* ---- Append only defines ---- */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* ---- Hashes related defaults ---- */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
237
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): a true _e is a no-op. When _e is false, the stringified
 * expression, __FILE__ and __LINE__ are passed to _redisAssert() and the
 * process is then terminated with _exit(1).
 *
 * redisPanic(_e): unconditionally passes the stringified argument, __FILE__
 * and __LINE__ to _redisPanic() and then terminates with _exit(1). The whole
 * expansion is parenthesized so that it is a single expression wherever it
 * is used (e.g. as an operand of ?: or inside a larger comma expression),
 * matching redisAssert. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
243
244 /*================================= Data types ============================== */
245
246 /* A redis object, that is a type able to hold a string / list / set / sorted set / hash */
247
/* The VM object structure: swap-file bookkeeping attached to a Redis object.
 *
 * Only the type is declared here. The original declaration also carried a
 * trailing `vm` declarator, which defined an unrelated file-scope variable
 * of this type; the field embedded in robj is the intended user. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
254
255 /* The actual Redis Object */
256 typedef struct redisObject {
257 void *ptr;
258 unsigned char type;
259 unsigned char encoding;
260 unsigned char storage; /* If this object is a key, where is the value?
261 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
262 unsigned char vtype; /* If this object is a key, and value is swapped out,
263 * this is the type of the swapped out object. */
264 int refcount;
265 /* VM fields, this are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate
267 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
268 * Redis without VM active will not have any overhead. */
269 struct redisObjectVM vm;
270 } robj;
271
/* Macro used to initialize a Redis object allocated on the stack.
 *
 * _var is the object itself (an lvalue, not a pointer) and _ptr the value
 * stored in its 'ptr' field. The object becomes a raw-encoded string with a
 * reference count of 1; 'storage' is set to REDIS_VM_MEMORY only when
 * server.vm_enabled is true.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like bug #85,
 * introduced exactly in this way.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves as one statement even as the body of an unbraced
 * if/else. Both arguments are parenthesized in the expansion. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
283
284 typedef struct redisDb {
285 dict *dict; /* The keyspace for this DB */
286 dict *expires; /* Timeout of keys with a timeout set */
287 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
288 dict *io_keys; /* Keys with clients waiting for VM I/O */
289 int id;
290 } redisDb;
291
292 /* Client MULTI/EXEC state */
293 typedef struct multiCmd {
294 robj **argv;
295 int argc;
296 struct redisCommand *cmd;
297 } multiCmd;
298
299 typedef struct multiState {
300 multiCmd *commands; /* Array of MULTI commands */
301 int count; /* Total number of MULTI commands */
302 } multiState;
303
304 /* With multiplexing we need to take per-clinet state.
305 * Clients are taken in a liked list. */
306 typedef struct redisClient {
307 int fd;
308 redisDb *db;
309 int dictid;
310 sds querybuf;
311 robj **argv, **mbargv;
312 int argc, mbargc;
313 int bulklen; /* bulk read len. -1 if not in bulk read mode */
314 int multibulk; /* multi bulk command format active */
315 list *reply;
316 int sentlen;
317 time_t lastinteraction; /* time of the last interaction, used for timeout */
318 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
319 int slaveseldb; /* slave selected db, if this client is a slave */
320 int authenticated; /* when requirepass is non-NULL */
321 int replstate; /* replication state if this is a slave */
322 int repldbfd; /* replication DB file descriptor */
323 long repldboff; /* replication DB file offset */
324 off_t repldbsize; /* replication DB file size */
325 multiState mstate; /* MULTI/EXEC state */
326 robj **blockingkeys; /* The key we are waiting to terminate a blocking
327 * operation such as BLPOP. Otherwise NULL. */
328 int blockingkeysnum; /* Number of blocking keys */
329 time_t blockingto; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
331 list *io_keys; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
333 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
334 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
335 } redisClient;
336
/* One save parameter: a (seconds, changes) pair. */
struct saveparam {
    time_t seconds;
    int changes;
};
341
342 /* Global server state structure */
343 struct redisServer {
344 int port;
345 int fd;
346 redisDb *db;
347 long long dirty; /* changes to DB from the last save */
348 list *clients;
349 list *slaves, *monitors;
350 char neterr[ANET_ERR_LEN];
351 aeEventLoop *el;
352 int cronloops; /* number of times the cron function run */
353 list *objfreelist; /* A list of freed objects to avoid malloc() */
354 time_t lastsave; /* Unix time of last save succeeede */
355 /* Fields used only for stats */
356 time_t stat_starttime; /* server start time */
357 long long stat_numcommands; /* number of processed commands */
358 long long stat_numconnections; /* number of connections received */
359 long long stat_expiredkeys; /* number of expired keys */
360 /* Configuration */
361 int verbosity;
362 int glueoutputbuf;
363 int maxidletime;
364 int dbnum;
365 int daemonize;
366 int appendonly;
367 int appendfsync;
368 time_t lastfsync;
369 int appendfd;
370 int appendseldb;
371 char *pidfile;
372 pid_t bgsavechildpid;
373 pid_t bgrewritechildpid;
374 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
380 char *appendfilename;
381 char *requirepass;
382 int rdbcompression;
383 int activerehashing;
384 /* Replication related */
385 int isslave;
386 char *masterauth;
387 char *masterhost;
388 int masterport;
389 redisClient *master; /* client that is master for this slave */
390 int replstate;
391 unsigned int maxclients;
392 unsigned long long maxmemory;
393 unsigned int blpop_blocked_clients;
394 unsigned int vm_blocked_clients;
395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to take this state global, in order to pass it to sortCompare() */
397 int sort_desc;
398 int sort_alpha;
399 int sort_bypattern;
400 /* Virtual memory configuration */
401 int vm_enabled;
402 char *vm_swap_file;
403 off_t vm_page_size;
404 off_t vm_pages;
405 unsigned long long vm_max_memory;
406 /* Hashes config */
407 size_t hash_max_zipmap_entries;
408 size_t hash_max_zipmap_value;
409 /* Virtual memory state */
410 FILE *vm_fp;
411 int vm_fd;
412 off_t vm_next_page; /* Next probably empty page */
413 off_t vm_near_pages; /* Number of pages allocated sequentially */
414 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
415 time_t unixtime; /* Unix time sampled every second. */
416 /* Virtual memory I/O threads stuff */
417 /* An I/O thread process an element taken from the io_jobs queue and
418 * put the result of the operation in the io_done list. While the
419 * job is being processed, it's put on io_processing queue. */
420 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
421 list *io_processing; /* List of VM I/O jobs being processed */
422 list *io_processed; /* List of VM I/O jobs already processed */
423 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
424 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
425 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
427 pthread_attr_t io_threads_attr; /* attributes for threads creation */
428 int io_active_threads; /* Number of running I/O threads */
429 int vm_max_threads; /* Max number of I/O threads running at the same time */
430 /* Our main thread is blocked on the event loop, locking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The followings are the two pipe FDs. */
434 int io_ready_pipe_read;
435 int io_ready_pipe_write;
436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages;
438 unsigned long long vm_stats_swapped_objects;
439 unsigned long long vm_stats_swapouts;
440 unsigned long long vm_stats_swapins;
441 /* Pubsub */
442 dict *pubsub_channels; /* Map channels to list of subscribed clients */
443 list *pubsub_patterns; /* A list of pubsub_patterns */
444 /* Misc */
445 FILE *devnull;
446 };
447
448 typedef struct pubsubPattern {
449 redisClient *client;
450 robj *pattern;
451 } pubsubPattern;
452
453 typedef void redisCommandProc(redisClient *c);
454 struct redisCommand {
455 char *name;
456 redisCommandProc *proc;
457 int arity;
458 int flags;
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc *vm_preload_proc;
463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* THe last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
467 };
468
/* A symbol name paired with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
473
474 typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480 } redisSortObject;
481
482 typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485 } redisSortOperation;
486
487 /* ZSETs use a specialized version of Skiplists */
488
489 typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
491 struct zskiplistNode *backward;
492 unsigned int *span;
493 double score;
494 robj *obj;
495 } zskiplistNode;
496
497 typedef struct zskiplist {
498 struct zskiplistNode *header, *tail;
499 unsigned long length;
500 int level;
501 } zskiplist;
502
503 typedef struct zset {
504 dict *dict;
505 zskiplist *zsl;
506 } zset;
507
508 /* Our shared "common" objects */
509
510 #define REDIS_SHARED_INTEGERS 10000
511 struct sharedObjectsStruct {
512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
513 *colon, *nullbulk, *nullmultibulk, *queued,
514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
516 *select0, *select1, *select2, *select3, *select4,
517 *select5, *select6, *select7, *select8, *select9,
518 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
520 } shared;
521
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized at
 * runtime to avoid strange compiler optimizations. */
static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
527
528 /* VM threaded I/O request message */
529 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
530 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
531 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
532 typedef struct iojob {
533 int type; /* Request type, REDIS_IOJOB_* */
534 redisDb *db;/* Redis database */
535 robj *key; /* This I/O request is about swapping this key */
536 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
537 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
538 off_t page; /* Swap page where to read/write the object */
539 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
540 int canceled; /* True if this command was canceled by blocking side of VM */
541 pthread_t thread; /* ID of the thread processing this entry */
542 } iojob;
543
544 /*================================ Prototypes =============================== */
545
546 static void freeStringObject(robj *o);
547 static void freeListObject(robj *o);
548 static void freeSetObject(robj *o);
549 static void decrRefCount(void *o);
550 static robj *createObject(int type, void *ptr);
551 static void freeClient(redisClient *c);
552 static int rdbLoad(char *filename);
553 static void addReply(redisClient *c, robj *obj);
554 static void addReplySds(redisClient *c, sds s);
555 static void incrRefCount(robj *o);
556 static int rdbSaveBackground(char *filename);
557 static robj *createStringObject(char *ptr, size_t len);
558 static robj *dupStringObject(robj *o);
559 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
560 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
561 static int syncWithMaster(void);
562 static robj *tryObjectEncoding(robj *o);
563 static robj *getDecodedObject(robj *o);
564 static int removeExpire(redisDb *db, robj *key);
565 static int expireIfNeeded(redisDb *db, robj *key);
566 static int deleteIfVolatile(redisDb *db, robj *key);
567 static int deleteIfSwapped(redisDb *db, robj *key);
568 static int deleteKey(redisDb *db, robj *key);
569 static time_t getExpire(redisDb *db, robj *key);
570 static int setExpire(redisDb *db, robj *key, time_t when);
571 static void updateSlavesWaitingBgsave(int bgsaveerr);
572 static void freeMemoryIfNeeded(void);
573 static int processCommand(redisClient *c);
574 static void setupSigSegvAction(void);
575 static void rdbRemoveTempFile(pid_t childpid);
576 static void aofRemoveTempFile(pid_t childpid);
577 static size_t stringObjectLen(robj *o);
578 static void processInputBuffer(redisClient *c);
579 static zskiplist *zslCreate(void);
580 static void zslFree(zskiplist *zsl);
581 static void zslInsert(zskiplist *zsl, double score, robj *obj);
582 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
583 static void initClientMultiState(redisClient *c);
584 static void freeClientMultiState(redisClient *c);
585 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
586 static void unblockClientWaitingData(redisClient *c);
587 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
588 static void vmInit(void);
589 static void vmMarkPagesFree(off_t page, off_t count);
590 static robj *vmLoadObject(robj *key);
591 static robj *vmPreviewObject(robj *key);
592 static int vmSwapOneObjectBlocking(void);
593 static int vmSwapOneObjectThreaded(void);
594 static int vmCanSwapOut(void);
595 static int tryFreeOneObjectFromFreelist(void);
596 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
597 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
598 static void vmCancelThreadedIOJob(robj *o);
599 static void lockThreadedIO(void);
600 static void unlockThreadedIO(void);
601 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
602 static void freeIOJob(iojob *j);
603 static void queueIOJob(iojob *j);
604 static int vmWriteObjectOnSwap(robj *o, off_t page);
605 static robj *vmReadObjectFromSwap(off_t page, int type);
606 static void waitEmptyIOJobsQueue(void);
607 static void vmReopenSwapFile(void);
608 static int vmFreePage(off_t page);
609 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
610 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
611 static int dontWaitForSwappedKey(redisClient *c, robj *key);
612 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
613 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
614 static struct redisCommand *lookupCommand(char *name);
615 static void call(redisClient *c, struct redisCommand *cmd);
616 static void resetClient(redisClient *c);
617 static void convertToRealHash(robj *o);
618 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
619 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
620 static void freePubsubPattern(void *p);
621 static int listMatchPubsubPattern(void *a, void *b);
622 static int compareStringObjects(robj *a, robj *b);
623 static void usage();
624
/* Forward declarations of the command implementations. Each one takes the
 * redisClient as its only argument, i.e. the redisCommandProc signature, so
 * it can be stored in the 'proc' field of a cmdTable entry.
 * NOTE(review): rpoplpushcommand is spelled with a lowercase 'c', unlike its
 * siblings; cmdTable refers to it with the same spelling. */
static void authCommand(redisClient *c);
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
static void bgrewriteaofCommand(redisClient *c);
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
static void smoveCommand(redisClient *c);
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
static void spopCommand(redisClient *c);
static void srandmemberCommand(redisClient *c);
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
static void rpoplpushcommand(redisClient *c);
static void infoCommand(redisClient *c);
static void mgetCommand(redisClient *c);
static void monitorCommand(redisClient *c);
static void expireCommand(redisClient *c);
static void expireatCommand(redisClient *c);
static void getsetCommand(redisClient *c);
static void ttlCommand(redisClient *c);
static void slaveofCommand(redisClient *c);
static void debugCommand(redisClient *c);
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
static void zaddCommand(redisClient *c);
static void zincrbyCommand(redisClient *c);
static void zrangeCommand(redisClient *c);
static void zrangebyscoreCommand(redisClient *c);
static void zcountCommand(redisClient *c);
static void zrevrangeCommand(redisClient *c);
static void zcardCommand(redisClient *c);
static void zremCommand(redisClient *c);
static void zscoreCommand(redisClient *c);
static void zremrangebyscoreCommand(redisClient *c);
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
static void discardCommand(redisClient *c);
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
static void appendCommand(redisClient *c);
static void substrCommand(redisClient *c);
static void zrankCommand(redisClient *c);
static void zrevrankCommand(redisClient *c);
static void hsetCommand(redisClient *c);
static void hsetnxCommand(redisClient *c);
static void hgetCommand(redisClient *c);
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
static void hdelCommand(redisClient *c);
static void hlenCommand(redisClient *c);
static void zremrangebyrankCommand(redisClient *c);
static void zunionCommand(redisClient *c);
static void zinterCommand(redisClient *c);
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
static void hexistsCommand(redisClient *c);
static void configCommand(redisClient *c);
static void hincrbyCommand(redisClient *c);
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
static void publishCommand(redisClient *c);
729
730 /*================================= Globals ================================= */
731
/* Global vars */
/* File-local instance of struct redisServer holding the server state. */
static struct redisServer server; /* server global state */
734 static struct redisCommand cmdTable[] = {
735 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
737 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
738 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
739 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
740 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
741 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
745 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
757 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
758 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
759 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
760 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
761 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
762 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
766 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
767 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
768 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
769 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
770 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
771 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
773 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
774 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
775 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
778 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
779 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
782 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
787 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
792 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
801 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
803 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
804 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
816 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
824 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
829 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
835 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
840 {NULL,NULL,0,0,NULL,0,0,0}
841 };
842
843 /*============================ Utility functions ============================ */
844
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a set: "[abc]" lists members, "[a-z]" is a
 *           range, a leading '^' negates the set and a backslash takes the
 *           next byte literally. A set without the closing ']' extends up
 *           to the end of the pattern.
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero bytes are compared after tolower().
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer is accessed outside its declared length, so they do not
 * need to be nul terminated, and a construct consuming one byte ("[...]"
 * or a literal) never matches once the string is exhausted.
 *
 * NOTE(review): tolower() and the range bounds still receive plain char
 * values, so bytes >= 0x80 are negative where char is signed. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* A run of '*' is the same as a single one. The length check
             * comes first so that pattern[1] is never read past the end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A set always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back by one so that the
                     * increment after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    /* Plain member; a trailing backslash ends up here and
                     * is compared as itself. */
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Trailing '*' match the empty remainder of the string. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
967
/* Glob-style matching of two nul terminated C strings: the lengths are
 * taken with strlen() and the work is done by stringmatchlen().
 * Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
971
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' sign, a run of decimal digits and an
 * optional unit, matched case insensitively:
 *
 *   (none) or "b" -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error, otherwise it's
 * set to 0. The errors are:
 *
 *  - unknown unit: the numeric part is returned without scaling.
 *  - numeric part of 128 or more characters: LLONG_MAX is returned.
 *  - result not representable in a long long: LLONG_MAX or LLONG_MIN is
 *    returned according to the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * isdigit() is undefined for negative values of a plain char. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Length of the numeric part, sign included. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: check the range before multiplying, as signed
     * overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1018
1019 static void redisLog(int level, const char *fmt, ...) {
1020 va_list ap;
1021 FILE *fp;
1022
1023 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1024 if (!fp) return;
1025
1026 va_start(ap, fmt);
1027 if (level >= server.verbosity) {
1028 char *c = ".-*#";
1029 char buf[64];
1030 time_t now;
1031
1032 now = time(NULL);
1033 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1034 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
1035 vfprintf(fp, fmt, ap);
1036 fprintf(fp,"\n");
1037 fflush(fp);
1038 }
1039 va_end(ap);
1040
1041 if (server.logfile) fclose(fp);
1042 }
1043
1044 /*====================== Hash table type implementation ==================== */
1045
1046 /* This is a hash table type that uses the SDS dynamic strings library as
1047 * keys and redis objects as values (objects can hold SDS strings,
1048 * lists, sets). */
1049
/* Value destructor for hash tables whose values are plain heap blocks:
 * the value is released with zfree(). 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1055
1056 static void dictListDestructor(void *privdata, void *val)
1057 {
1058 DICT_NOTUSED(privdata);
1059 listRelease((list*)val);
1060 }
1061
1062 static int sdsDictKeyCompare(void *privdata, const void *key1,
1063 const void *key2)
1064 {
1065 int l1,l2;
1066 DICT_NOTUSED(privdata);
1067
1068 l1 = sdslen((sds)key1);
1069 l2 = sdslen((sds)key2);
1070 if (l1 != l2) return 0;
1071 return memcmp(key1, key2, l1) == 0;
1072 }
1073
/* Key/value destructor for hash tables holding redis objects: drops one
 * reference with decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1081
1082 static int dictObjKeyCompare(void *privdata, const void *key1,
1083 const void *key2)
1084 {
1085 const robj *o1 = key1, *o2 = key2;
1086 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1087 }
1088
1089 static unsigned int dictObjHash(const void *key) {
1090 const robj *o = key;
1091 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1092 }
1093
1094 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1095 const void *key2)
1096 {
1097 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1098 int cmp;
1099
1100 if (o1->encoding == REDIS_ENCODING_INT &&
1101 o2->encoding == REDIS_ENCODING_INT &&
1102 o1->ptr == o2->ptr) return 1;
1103
1104 o1 = getDecodedObject(o1);
1105 o2 = getDecodedObject(o2);
1106 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1107 decrRefCount(o1);
1108 decrRefCount(o2);
1109 return cmp;
1110 }
1111
1112 static unsigned int dictEncObjHash(const void *key) {
1113 robj *o = (robj*) key;
1114
1115 if (o->encoding == REDIS_ENCODING_RAW) {
1116 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1117 } else {
1118 if (o->encoding == REDIS_ENCODING_INT) {
1119 char buf[32];
1120 int len;
1121
1122 len = snprintf(buf,32,"%ld",(long)o->ptr);
1123 return dictGenHashFunction((unsigned char*)buf, len);
1124 } else {
1125 unsigned int hash;
1126
1127 o = getDecodedObject(o);
1128 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1129 decrRefCount(o);
1130 return hash;
1131 }
1132 }
1133 }
1134
1135 /* Sets type and expires.
 * Keys are redis objects, possibly encoded: dictEncObjHash and
 * dictEncObjKeyCompare work on their string form. A key is released with
 * dictRedisObjectDestructor when its entry goes away, values have no
 * destructor.
 * NOTE(review): Db->expires is documented further down as using
 * keyptrDictType, so the "and expires" part may be outdated -- confirm. */
1136 static dictType setDictType = {
1137 dictEncObjHash, /* hash function */
1138 NULL, /* key dup */
1139 NULL, /* val dup */
1140 dictEncObjKeyCompare, /* key compare */
1141 dictRedisObjectDestructor, /* key destructor */
1142 NULL /* val destructor */
1143 };
1144
1145 /* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects, possibly encoded, released with
 * dictRedisObjectDestructor. Values are heap allocated doubles released
 * with zfree() by dictVanillaFree. */
1146 static dictType zsetDictType = {
1147 dictEncObjHash, /* hash function */
1148 NULL, /* key dup */
1149 NULL, /* val dup */
1150 dictEncObjKeyCompare, /* key compare */
1151 dictRedisObjectDestructor, /* key destructor */
1152 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1153 };
1154
1155 /* Db->dict
 * Keys are redis objects hashed and compared by dictObjHash and
 * dictObjKeyCompare, which use the ptr field as an sds string without
 * decoding it. Both keys and values are released with
 * dictRedisObjectDestructor, which skips NULL values (swapped out keys). */
1156 static dictType dbDictType = {
1157 dictObjHash, /* hash function */
1158 NULL, /* key dup */
1159 NULL, /* val dup */
1160 dictObjKeyCompare, /* key compare */
1161 dictRedisObjectDestructor, /* key destructor */
1162 dictRedisObjectDestructor /* val destructor */
1163 };
1164
1165 /* Db->expires
 * Same key handling as Db->dict (dictObjHash / dictObjKeyCompare, keys
 * released with dictRedisObjectDestructor), but values have no destructor:
 * nothing is freed for the value when an entry is removed. */
1166 static dictType keyptrDictType = {
1167 dictObjHash, /* hash function */
1168 NULL, /* key dup */
1169 NULL, /* val dup */
1170 dictObjKeyCompare, /* key compare */
1171 dictRedisObjectDestructor, /* key destructor */
1172 NULL /* val destructor */
1173 };
1174
1175 /* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both fields (keys) and values are redis objects released with
 * dictRedisObjectDestructor. Keys may be encoded, so they are hashed and
 * compared through dictEncObjHash and dictEncObjKeyCompare. */
1176 static dictType hashDictType = {
1177 dictEncObjHash, /* hash function */
1178 NULL, /* key dup */
1179 NULL, /* val dup */
1180 dictEncObjKeyCompare, /* key compare */
1181 dictRedisObjectDestructor, /* key destructor */
1182 dictRedisObjectDestructor /* val destructor */
1183 };
1184
1185 /* Keylist hash table type has unencoded redis objects as keys and
1186 * lists as values. It's used for blocking operations (BLPOP) and to
1187 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are released with dictRedisObjectDestructor, the list stored as
 * value is released with listRelease() by dictListDestructor. */
1188 static dictType keylistDictType = {
1189 dictObjHash, /* hash function */
1190 NULL, /* key dup */
1191 NULL, /* val dup */
1192 dictObjKeyCompare, /* key compare */
1193 dictRedisObjectDestructor, /* key destructor */
1194 dictListDestructor /* val destructor */
1195 };
1196
1197 static void version();
1198
1199 /* ========================= Random utility functions ======================= */
1200
1201 /* Redis generally does not try to recover from out of memory conditions
1202 * when allocating objects or strings, it is not clear if it will be possible
1203 * to report this condition to the client since the networking layer itself
1204 * is based on heap allocation for send buffers, so we simply abort.
1205 * At least the code will be simpler to read... */
1206 static void oom(const char *msg) {
1207 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1208 sleep(1);
1209 abort();
1210 }
1211
1212 /* ====================== Redis server networking stuff ===================== */
1213 static void closeTimedoutClients(void) {
1214 redisClient *c;
1215 listNode *ln;
1216 time_t now = time(NULL);
1217 listIter li;
1218
1219 listRewind(server.clients,&li);
1220 while ((ln = listNext(&li)) != NULL) {
1221 c = listNodeValue(ln);
1222 if (server.maxidletime &&
1223 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1224 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1225 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1226 listLength(c->pubsub_patterns) == 0 &&
1227 (now - c->lastinteraction > server.maxidletime))
1228 {
1229 redisLog(REDIS_VERBOSE,"Closing idle client");
1230 freeClient(c);
1231 } else if (c->flags & REDIS_BLOCKED) {
1232 if (c->blockingto != 0 && c->blockingto < now) {
1233 addReply(c,shared.nullmultibulk);
1234 unblockClientWaitingData(c);
1235 }
1236 }
1237 }
1238 }
1239
1240 static int htNeedsResize(dict *dict) {
1241 long long size, used;
1242
1243 size = dictSlots(dict);
1244 used = dictSize(dict);
1245 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1246 (used*100/size < REDIS_HT_MINFILL));
1247 }
1248
1249 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1250 * we resize the hash table to save memory */
1251 static void tryResizeHashTables(void) {
1252 int j;
1253
1254 for (j = 0; j < server.dbnum; j++) {
1255 if (htNeedsResize(server.db[j].dict))
1256 dictResize(server.db[j].dict);
1257 if (htNeedsResize(server.db[j].expires))
1258 dictResize(server.db[j].expires);
1259 }
1260 }
1261
1262 /* Our hash table implementation performs rehashing incrementally while
1263 * we write/read from the hash table. Still if the server is idle, the hash
1264 * table will use two tables for a long time. So we try to use 1 millisecond
1265 * of CPU time at every serverCron() loop in order to rehash some key. */
1266 static void incrementallyRehash(void) {
1267 int j;
1268
1269 for (j = 0; j < server.dbnum; j++) {
1270 if (dictIsRehashing(server.db[j].dict)) {
1271 dictRehashMilliseconds(server.db[j].dict,1);
1272 break; /* already used our millisecond for this loop... */
1273 }
1274 }
1275 }
1276
1277 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1278 void backgroundSaveDoneHandler(int statloc) {
1279 int exitcode = WEXITSTATUS(statloc);
1280 int bysignal = WIFSIGNALED(statloc);
1281
1282 if (!bysignal && exitcode == 0) {
1283 redisLog(REDIS_NOTICE,
1284 "Background saving terminated with success");
1285 server.dirty = 0;
1286 server.lastsave = time(NULL);
1287 } else if (!bysignal && exitcode != 0) {
1288 redisLog(REDIS_WARNING, "Background saving error");
1289 } else {
1290 redisLog(REDIS_WARNING,
1291 "Background saving terminated by signal %d", WTERMSIG(statloc));
1292 rdbRemoveTempFile(server.bgsavechildpid);
1293 }
1294 server.bgsavechildpid = -1;
1295 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1296 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1297 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1298 }
1299
1300 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1301 * Handle this. */
1302 void backgroundRewriteDoneHandler(int statloc) {
1303 int exitcode = WEXITSTATUS(statloc);
1304 int bysignal = WIFSIGNALED(statloc);
1305
1306 if (!bysignal && exitcode == 0) {
1307 int fd;
1308 char tmpfile[256];
1309
1310 redisLog(REDIS_NOTICE,
1311 "Background append only file rewriting terminated with success");
1312 /* Now it's time to flush the differences accumulated by the parent */
1313 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1314 fd = open(tmpfile,O_WRONLY|O_APPEND);
1315 if (fd == -1) {
1316 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1317 goto cleanup;
1318 }
1319 /* Flush our data... */
1320 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1321 (signed) sdslen(server.bgrewritebuf)) {
1322 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1323 close(fd);
1324 goto cleanup;
1325 }
1326 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1327 /* Now our work is to rename the temp file into the stable file. And
1328 * switch the file descriptor used by the server for append only. */
1329 if (rename(tmpfile,server.appendfilename) == -1) {
1330 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1331 close(fd);
1332 goto cleanup;
1333 }
1334 /* Mission completed... almost */
1335 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1336 if (server.appendfd != -1) {
1337 /* If append only is actually enabled... */
1338 close(server.appendfd);
1339 server.appendfd = fd;
1340 fsync(fd);
1341 server.appendseldb = -1; /* Make sure it will issue SELECT */
1342 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1343 } else {
1344 /* If append only is disabled we just generate a dump in this
1345 * format. Why not? */
1346 close(fd);
1347 }
1348 } else if (!bysignal && exitcode != 0) {
1349 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1350 } else {
1351 redisLog(REDIS_WARNING,
1352 "Background append only file rewriting terminated by signal %d",
1353 WTERMSIG(statloc));
1354 }
1355 cleanup:
1356 sdsfree(server.bgrewritebuf);
1357 server.bgrewritebuf = sdsempty();
1358 aofRemoveTempFile(server.bgrewritechildpid);
1359 server.bgrewritechildpid = -1;
1360 }
1361
1362 /* This function is called once a background process of some kind terminates,
1363 * as we want to avoid resizing the hash tables when there is a child in order
1364 * to play well with copy-on-write (otherwise when a resize happens lots of
1365 * memory pages are copied). The goal of this function is to update the ability
1366 * for dict.c to resize the hash tables accordingly to the fact we have o not
1367 * running childs. */
1368 static void updateDictResizePolicy(void) {
1369 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1370 dictEnableResize();
1371 else
1372 dictDisableResize();
1373 }
1374
1375 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1376 int j, loops = server.cronloops++;
1377 REDIS_NOTUSED(eventLoop);
1378 REDIS_NOTUSED(id);
1379 REDIS_NOTUSED(clientData);
1380
1381 /* We take a cached value of the unix time in the global state because
1382 * with virtual memory and aging there is to store the current time
1383 * in objects at every object access, and accuracy is not needed.
1384 * To access a global var is faster than calling time(NULL) */
1385 server.unixtime = time(NULL);
1386
1387 /* Show some info about non-empty databases */
1388 for (j = 0; j < server.dbnum; j++) {
1389 long long size, used, vkeys;
1390
1391 size = dictSlots(server.db[j].dict);
1392 used = dictSize(server.db[j].dict);
1393 vkeys = dictSize(server.db[j].expires);
1394 if (!(loops % 50) && (used || vkeys)) {
1395 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1396 /* dictPrintStats(server.dict); */
1397 }
1398 }
1399
1400 /* We don't want to resize the hash tables while a bacground saving
1401 * is in progress: the saving child is created using fork() that is
1402 * implemented with a copy-on-write semantic in most modern systems, so
1403 * if we resize the HT while there is the saving child at work actually
1404 * a lot of memory movements in the parent will cause a lot of pages
1405 * copied. */
1406 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1407 if (!(loops % 10)) tryResizeHashTables();
1408 if (server.activerehashing) incrementallyRehash();
1409 }
1410
1411 /* Show information about connected clients */
1412 if (!(loops % 50)) {
1413 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1414 listLength(server.clients)-listLength(server.slaves),
1415 listLength(server.slaves),
1416 zmalloc_used_memory());
1417 }
1418
1419 /* Close connections of timedout clients */
1420 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1421 closeTimedoutClients();
1422
1423 /* Check if a background saving or AOF rewrite in progress terminated */
1424 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1425 int statloc;
1426 pid_t pid;
1427
1428 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1429 if (pid == server.bgsavechildpid) {
1430 backgroundSaveDoneHandler(statloc);
1431 } else {
1432 backgroundRewriteDoneHandler(statloc);
1433 }
1434 updateDictResizePolicy();
1435 }
1436 } else {
1437 /* If there is not a background saving in progress check if
1438 * we have to save now */
1439 time_t now = time(NULL);
1440 for (j = 0; j < server.saveparamslen; j++) {
1441 struct saveparam *sp = server.saveparams+j;
1442
1443 if (server.dirty >= sp->changes &&
1444 now-server.lastsave > sp->seconds) {
1445 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1446 sp->changes, sp->seconds);
1447 rdbSaveBackground(server.dbfilename);
1448 break;
1449 }
1450 }
1451 }
1452
1453 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1454 * will use few CPU cycles if there are few expiring keys, otherwise
1455 * it will get more aggressive to avoid that too much memory is used by
1456 * keys that can be removed from the keyspace. */
1457 for (j = 0; j < server.dbnum; j++) {
1458 int expired;
1459 redisDb *db = server.db+j;
1460
1461 /* Continue to expire if at the end of the cycle more than 25%
1462 * of the keys were expired. */
1463 do {
1464 long num = dictSize(db->expires);
1465 time_t now = time(NULL);
1466
1467 expired = 0;
1468 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1469 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1470 while (num--) {
1471 dictEntry *de;
1472 time_t t;
1473
1474 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1475 t = (time_t) dictGetEntryVal(de);
1476 if (now > t) {
1477 deleteKey(db,dictGetEntryKey(de));
1478 expired++;
1479 server.stat_expiredkeys++;
1480 }
1481 }
1482 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1483 }
1484
1485 /* Swap a few keys on disk if we are over the memory limit and VM
1486 * is enbled. Try to free objects from the free list first. */
1487 if (vmCanSwapOut()) {
1488 while (server.vm_enabled && zmalloc_used_memory() >
1489 server.vm_max_memory)
1490 {
1491 int retval;
1492
1493 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1494 retval = (server.vm_max_threads == 0) ?
1495 vmSwapOneObjectBlocking() :
1496 vmSwapOneObjectThreaded();
1497 if (retval == REDIS_ERR && !(loops % 300) &&
1498 zmalloc_used_memory() >
1499 (server.vm_max_memory+server.vm_max_memory/10))
1500 {
1501 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1502 }
1503 /* Note that when using threade I/O we free just one object,
1504 * because anyway when the I/O thread in charge to swap this
1505 * object out will finish, the handler of completed jobs
1506 * will try to swap more objects if we are still out of memory. */
1507 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1508 }
1509 }
1510
1511 /* Check if we should connect to a MASTER */
1512 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1513 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1514 if (syncWithMaster() == REDIS_OK) {
1515 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1516 }
1517 }
1518 return 100;
1519 }
1520
1521 /* This function gets called every time Redis is entering the
1522 * main loop of the event driven library, that is, before to sleep
1523 * for ready file descriptors. */
1524 static void beforeSleep(struct aeEventLoop *eventLoop) {
1525 REDIS_NOTUSED(eventLoop);
1526
1527 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1528 listIter li;
1529 listNode *ln;
1530
1531 listRewind(server.io_ready_clients,&li);
1532 while((ln = listNext(&li))) {
1533 redisClient *c = ln->value;
1534 struct redisCommand *cmd;
1535
1536 /* Resume the client. */
1537 listDelNode(server.io_ready_clients,ln);
1538 c->flags &= (~REDIS_IO_WAIT);
1539 server.vm_blocked_clients--;
1540 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1541 readQueryFromClient, c);
1542 cmd = lookupCommand(c->argv[0]->ptr);
1543 assert(cmd != NULL);
1544 call(c,cmd);
1545 resetClient(c);
1546 /* There may be more data to process in the input buffer. */
1547 if (c->querybuf && sdslen(c->querybuf) > 0)
1548 processInputBuffer(c);
1549 }
1550 }
1551 }
1552
1553 static void createSharedObjects(void) {
1554 int j;
1555
1556 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1557 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1558 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1559 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1560 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1561 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1562 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1563 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1564 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1565 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1566 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1567 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1568 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1569 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1570 "-ERR no such key\r\n"));
1571 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1572 "-ERR syntax error\r\n"));
1573 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1574 "-ERR source and destination objects are the same\r\n"));
1575 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1576 "-ERR index out of range\r\n"));
1577 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1578 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1579 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1580 shared.select0 = createStringObject("select 0\r\n",10);
1581 shared.select1 = createStringObject("select 1\r\n",10);
1582 shared.select2 = createStringObject("select 2\r\n",10);
1583 shared.select3 = createStringObject("select 3\r\n",10);
1584 shared.select4 = createStringObject("select 4\r\n",10);
1585 shared.select5 = createStringObject("select 5\r\n",10);
1586 shared.select6 = createStringObject("select 6\r\n",10);
1587 shared.select7 = createStringObject("select 7\r\n",10);
1588 shared.select8 = createStringObject("select 8\r\n",10);
1589 shared.select9 = createStringObject("select 9\r\n",10);
1590 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1591 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1592 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1593 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1594 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1595 shared.mbulk3 = createStringObject("*3\r\n",4);
1596 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1597 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1598 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1599 }
1600 }
1601
1602 static void appendServerSaveParams(time_t seconds, int changes) {
1603 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1604 server.saveparams[server.saveparamslen].seconds = seconds;
1605 server.saveparams[server.saveparamslen].changes = changes;
1606 server.saveparamslen++;
1607 }
1608
1609 static void resetServerSaveParams() {
1610 zfree(server.saveparams);
1611 server.saveparams = NULL;
1612 server.saveparamslen = 0;
1613 }
1614
1615 static void initServerConfig() {
1616 server.dbnum = REDIS_DEFAULT_DBNUM;
1617 server.port = REDIS_SERVERPORT;
1618 server.verbosity = REDIS_VERBOSE;
1619 server.maxidletime = REDIS_MAXIDLETIME;
1620 server.saveparams = NULL;
1621 server.logfile = NULL; /* NULL = log on standard output */
1622 server.bindaddr = NULL;
1623 server.glueoutputbuf = 1;
1624 server.daemonize = 0;
1625 server.appendonly = 0;
1626 server.appendfsync = APPENDFSYNC_ALWAYS;
1627 server.lastfsync = time(NULL);
1628 server.appendfd = -1;
1629 server.appendseldb = -1; /* Make sure the first time will not match */
1630 server.pidfile = zstrdup("/var/run/redis.pid");
1631 server.dbfilename = zstrdup("dump.rdb");
1632 server.appendfilename = zstrdup("appendonly.aof");
1633 server.requirepass = NULL;
1634 server.rdbcompression = 1;
1635 server.activerehashing = 1;
1636 server.maxclients = 0;
1637 server.blpop_blocked_clients = 0;
1638 server.maxmemory = 0;
1639 server.vm_enabled = 0;
1640 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1641 server.vm_page_size = 256; /* 256 bytes per page */
1642 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1643 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1644 server.vm_max_threads = 4;
1645 server.vm_blocked_clients = 0;
1646 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1647 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1648
1649 resetServerSaveParams();
1650
1651 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1652 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1653 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1654 /* Replication related */
1655 server.isslave = 0;
1656 server.masterauth = NULL;
1657 server.masterhost = NULL;
1658 server.masterport = 6379;
1659 server.master = NULL;
1660 server.replstate = REDIS_REPL_NONE;
1661
1662 /* Double constants initialization */
1663 R_Zero = 0.0;
1664 R_PosInf = 1.0/R_Zero;
1665 R_NegInf = -1.0/R_Zero;
1666 R_Nan = R_Zero/R_Zero;
1667 }
1668
1669 static void initServer() {
1670 int j;
1671
1672 signal(SIGHUP, SIG_IGN);
1673 signal(SIGPIPE, SIG_IGN);
1674 setupSigSegvAction();
1675
1676 server.devnull = fopen("/dev/null","w");
1677 if (server.devnull == NULL) {
1678 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1679 exit(1);
1680 }
1681 server.clients = listCreate();
1682 server.slaves = listCreate();
1683 server.monitors = listCreate();
1684 server.objfreelist = listCreate();
1685 createSharedObjects();
1686 server.el = aeCreateEventLoop();
1687 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1688 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1689 if (server.fd == -1) {
1690 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1691 exit(1);
1692 }
1693 for (j = 0; j < server.dbnum; j++) {
1694 server.db[j].dict = dictCreate(&dbDictType,NULL);
1695 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1696 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1697 if (server.vm_enabled)
1698 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1699 server.db[j].id = j;
1700 }
1701 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1702 server.pubsub_patterns = listCreate();
1703 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1704 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1705 server.cronloops = 0;
1706 server.bgsavechildpid = -1;
1707 server.bgrewritechildpid = -1;
1708 server.bgrewritebuf = sdsempty();
1709 server.lastsave = time(NULL);
1710 server.dirty = 0;
1711 server.stat_numcommands = 0;
1712 server.stat_numconnections = 0;
1713 server.stat_expiredkeys = 0;
1714 server.stat_starttime = time(NULL);
1715 server.unixtime = time(NULL);
1716 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1717 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1718 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1719
1720 if (server.appendonly) {
1721 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1722 if (server.appendfd == -1) {
1723 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1724 strerror(errno));
1725 exit(1);
1726 }
1727 }
1728
1729 if (server.vm_enabled) vmInit();
1730 }
1731
1732 /* Empty the whole database */
1733 static long long emptyDb() {
1734 int j;
1735 long long removed = 0;
1736
1737 for (j = 0; j < server.dbnum; j++) {
1738 removed += dictSize(server.db[j].dict);
1739 dictEmpty(server.db[j].dict);
1740 dictEmpty(server.db[j].expires);
1741 }
1742 return removed;
1743 }
1744
/* Convert a "yes" / "no" string (compared ignoring case) into a boolean.
 * Returns 1 for "yes", 0 for "no" and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1750
1751 /* I agree, this is a very rudimental way to load a configuration...
1752 will improve later if the config gets more complex */
1753 static void loadServerConfig(char *filename) {
1754 FILE *fp;
1755 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1756 int linenum = 0;
1757 sds line = NULL;
1758
1759 if (filename[0] == '-' && filename[1] == '\0')
1760 fp = stdin;
1761 else {
1762 if ((fp = fopen(filename,"r")) == NULL) {
1763 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1764 exit(1);
1765 }
1766 }
1767
1768 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1769 sds *argv;
1770 int argc, j;
1771
1772 linenum++;
1773 line = sdsnew(buf);
1774 line = sdstrim(line," \t\r\n");
1775
1776 /* Skip comments and blank lines*/
1777 if (line[0] == '#' || line[0] == '\0') {
1778 sdsfree(line);
1779 continue;
1780 }
1781
1782 /* Split into arguments */
1783 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1784 sdstolower(argv[0]);
1785
1786 /* Execute config directives */
1787 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1788 server.maxidletime = atoi(argv[1]);
1789 if (server.maxidletime < 0) {
1790 err = "Invalid timeout value"; goto loaderr;
1791 }
1792 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1793 server.port = atoi(argv[1]);
1794 if (server.port < 1 || server.port > 65535) {
1795 err = "Invalid port"; goto loaderr;
1796 }
1797 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1798 server.bindaddr = zstrdup(argv[1]);
1799 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1800 int seconds = atoi(argv[1]);
1801 int changes = atoi(argv[2]);
1802 if (seconds < 1 || changes < 0) {
1803 err = "Invalid save parameters"; goto loaderr;
1804 }
1805 appendServerSaveParams(seconds,changes);
1806 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1807 if (chdir(argv[1]) == -1) {
1808 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1809 argv[1], strerror(errno));
1810 exit(1);
1811 }
1812 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1813 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1814 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1815 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1816 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1817 else {
1818 err = "Invalid log level. Must be one of debug, notice, warning";
1819 goto loaderr;
1820 }
1821 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1822 FILE *logfp;
1823
1824 server.logfile = zstrdup(argv[1]);
1825 if (!strcasecmp(server.logfile,"stdout")) {
1826 zfree(server.logfile);
1827 server.logfile = NULL;
1828 }
1829 if (server.logfile) {
1830 /* Test if we are able to open the file. The server will not
1831 * be able to abort just for this problem later... */
1832 logfp = fopen(server.logfile,"a");
1833 if (logfp == NULL) {
1834 err = sdscatprintf(sdsempty(),
1835 "Can't open the log file: %s", strerror(errno));
1836 goto loaderr;
1837 }
1838 fclose(logfp);
1839 }
1840 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1841 server.dbnum = atoi(argv[1]);
1842 if (server.dbnum < 1) {
1843 err = "Invalid number of databases"; goto loaderr;
1844 }
1845 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1846 loadServerConfig(argv[1]);
1847 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1848 server.maxclients = atoi(argv[1]);
1849 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1850 server.maxmemory = memtoll(argv[1],NULL);
1851 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1852 server.masterhost = sdsnew(argv[1]);
1853 server.masterport = atoi(argv[2]);
1854 server.replstate = REDIS_REPL_CONNECT;
1855 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1856 server.masterauth = zstrdup(argv[1]);
1857 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1858 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1859 err = "argument must be 'yes' or 'no'"; goto loaderr;
1860 }
1861 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1862 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1863 err = "argument must be 'yes' or 'no'"; goto loaderr;
1864 }
1865 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1866 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1867 err = "argument must be 'yes' or 'no'"; goto loaderr;
1868 }
1869 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1870 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1871 err = "argument must be 'yes' or 'no'"; goto loaderr;
1872 }
1873 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1874 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1875 err = "argument must be 'yes' or 'no'"; goto loaderr;
1876 }
1877 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1878 if (!strcasecmp(argv[1],"no")) {
1879 server.appendfsync = APPENDFSYNC_NO;
1880 } else if (!strcasecmp(argv[1],"always")) {
1881 server.appendfsync = APPENDFSYNC_ALWAYS;
1882 } else if (!strcasecmp(argv[1],"everysec")) {
1883 server.appendfsync = APPENDFSYNC_EVERYSEC;
1884 } else {
1885 err = "argument must be 'no', 'always' or 'everysec'";
1886 goto loaderr;
1887 }
1888 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1889 server.requirepass = zstrdup(argv[1]);
1890 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1891 zfree(server.pidfile);
1892 server.pidfile = zstrdup(argv[1]);
1893 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1894 zfree(server.dbfilename);
1895 server.dbfilename = zstrdup(argv[1]);
1896 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1897 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1898 err = "argument must be 'yes' or 'no'"; goto loaderr;
1899 }
1900 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1901 zfree(server.vm_swap_file);
1902 server.vm_swap_file = zstrdup(argv[1]);
1903 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1904 server.vm_max_memory = memtoll(argv[1],NULL);
1905 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1906 server.vm_page_size = memtoll(argv[1], NULL);
1907 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1908 server.vm_pages = memtoll(argv[1], NULL);
1909 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1910 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1911 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1912 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
1913 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1914 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
1915 } else {
1916 err = "Bad directive or wrong number of arguments"; goto loaderr;
1917 }
1918 for (j = 0; j < argc; j++)
1919 sdsfree(argv[j]);
1920 zfree(argv);
1921 sdsfree(line);
1922 }
1923 if (fp != stdin) fclose(fp);
1924 return;
1925
1926 loaderr:
1927 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1928 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1929 fprintf(stderr, ">>> '%s'\n", line);
1930 fprintf(stderr, "%s\n", err);
1931 exit(1);
1932 }
1933
1934 static void freeClientArgv(redisClient *c) {
1935 int j;
1936
1937 for (j = 0; j < c->argc; j++)
1938 decrRefCount(c->argv[j]);
1939 for (j = 0; j < c->mbargc; j++)
1940 decrRefCount(c->mbargv[j]);
1941 c->argc = 0;
1942 c->mbargc = 0;
1943 }
1944
1945 static void freeClient(redisClient *c) {
1946 listNode *ln;
1947
1948 /* Note that if the client we are freeing is blocked into a blocking
1949 * call, we have to set querybuf to NULL *before* to call
1950 * unblockClientWaitingData() to avoid processInputBuffer() will get
1951 * called. Also it is important to remove the file events after
1952 * this, because this call adds the READABLE event. */
1953 sdsfree(c->querybuf);
1954 c->querybuf = NULL;
1955 if (c->flags & REDIS_BLOCKED)
1956 unblockClientWaitingData(c);
1957
1958 /* Unsubscribe from all the pubsub channels */
1959 pubsubUnsubscribeAllChannels(c,0);
1960 pubsubUnsubscribeAllPatterns(c,0);
1961 dictRelease(c->pubsub_channels);
1962 listRelease(c->pubsub_patterns);
1963 /* Obvious cleanup */
1964 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1965 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1966 listRelease(c->reply);
1967 freeClientArgv(c);
1968 close(c->fd);
1969 /* Remove from the list of clients */
1970 ln = listSearchKey(server.clients,c);
1971 redisAssert(ln != NULL);
1972 listDelNode(server.clients,ln);
1973 /* Remove from the list of clients waiting for swapped keys */
1974 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1975 ln = listSearchKey(server.io_ready_clients,c);
1976 if (ln) {
1977 listDelNode(server.io_ready_clients,ln);
1978 server.vm_blocked_clients--;
1979 }
1980 }
1981 while (server.vm_enabled && listLength(c->io_keys)) {
1982 ln = listFirst(c->io_keys);
1983 dontWaitForSwappedKey(c,ln->value);
1984 }
1985 listRelease(c->io_keys);
1986 /* Master/slave cleanup */
1987 if (c->flags & REDIS_SLAVE) {
1988 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1989 close(c->repldbfd);
1990 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1991 ln = listSearchKey(l,c);
1992 redisAssert(ln != NULL);
1993 listDelNode(l,ln);
1994 }
1995 if (c->flags & REDIS_MASTER) {
1996 server.master = NULL;
1997 server.replstate = REDIS_REPL_CONNECT;
1998 }
1999 /* Release memory */
2000 zfree(c->argv);
2001 zfree(c->mbargv);
2002 freeClientMultiState(c);
2003 zfree(c);
2004 }
2005
2006 #define GLUEREPLY_UP_TO (1024)
2007 static void glueReplyBuffersIfNeeded(redisClient *c) {
2008 int copylen = 0;
2009 char buf[GLUEREPLY_UP_TO];
2010 listNode *ln;
2011 listIter li;
2012 robj *o;
2013
2014 listRewind(c->reply,&li);
2015 while((ln = listNext(&li))) {
2016 int objlen;
2017
2018 o = ln->value;
2019 objlen = sdslen(o->ptr);
2020 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2021 memcpy(buf+copylen,o->ptr,objlen);
2022 copylen += objlen;
2023 listDelNode(c->reply,ln);
2024 } else {
2025 if (copylen == 0) return;
2026 break;
2027 }
2028 }
2029 /* Now the output buffer is empty, add the new single element */
2030 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2031 listAddNodeHead(c->reply,o);
2032 }
2033
2034 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2035 redisClient *c = privdata;
2036 int nwritten = 0, totwritten = 0, objlen;
2037 robj *o;
2038 REDIS_NOTUSED(el);
2039 REDIS_NOTUSED(mask);
2040
2041 /* Use writev() if we have enough buffers to send */
2042 if (!server.glueoutputbuf &&
2043 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2044 !(c->flags & REDIS_MASTER))
2045 {
2046 sendReplyToClientWritev(el, fd, privdata, mask);
2047 return;
2048 }
2049
2050 while(listLength(c->reply)) {
2051 if (server.glueoutputbuf && listLength(c->reply) > 1)
2052 glueReplyBuffersIfNeeded(c);
2053
2054 o = listNodeValue(listFirst(c->reply));
2055 objlen = sdslen(o->ptr);
2056
2057 if (objlen == 0) {
2058 listDelNode(c->reply,listFirst(c->reply));
2059 continue;
2060 }
2061
2062 if (c->flags & REDIS_MASTER) {
2063 /* Don't reply to a master */
2064 nwritten = objlen - c->sentlen;
2065 } else {
2066 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2067 if (nwritten <= 0) break;
2068 }
2069 c->sentlen += nwritten;
2070 totwritten += nwritten;
2071 /* If we fully sent the object on head go to the next one */
2072 if (c->sentlen == objlen) {
2073 listDelNode(c->reply,listFirst(c->reply));
2074 c->sentlen = 0;
2075 }
2076 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2077 * bytes, in a single threaded server it's a good idea to serve
2078 * other clients as well, even if a very large request comes from
2079 * super fast link that is always able to accept data (in real world
2080 * scenario think about 'KEYS *' against the loopback interfae) */
2081 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2082 }
2083 if (nwritten == -1) {
2084 if (errno == EAGAIN) {
2085 nwritten = 0;
2086 } else {
2087 redisLog(REDIS_VERBOSE,
2088 "Error writing to client: %s", strerror(errno));
2089 freeClient(c);
2090 return;
2091 }
2092 }
2093 if (totwritten > 0) c->lastinteraction = time(NULL);
2094 if (listLength(c->reply) == 0) {
2095 c->sentlen = 0;
2096 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2097 }
2098 }
2099
2100 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2101 {
2102 redisClient *c = privdata;
2103 int nwritten = 0, totwritten = 0, objlen, willwrite;
2104 robj *o;
2105 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2106 int offset, ion = 0;
2107 REDIS_NOTUSED(el);
2108 REDIS_NOTUSED(mask);
2109
2110 listNode *node;
2111 while (listLength(c->reply)) {
2112 offset = c->sentlen;
2113 ion = 0;
2114 willwrite = 0;
2115
2116 /* fill-in the iov[] array */
2117 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2118 o = listNodeValue(node);
2119 objlen = sdslen(o->ptr);
2120
2121 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2122 break;
2123
2124 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2125 break; /* no more iovecs */
2126
2127 iov[ion].iov_base = ((char*)o->ptr) + offset;
2128 iov[ion].iov_len = objlen - offset;
2129 willwrite += objlen - offset;
2130 offset = 0; /* just for the first item */
2131 ion++;
2132 }
2133
2134 if(willwrite == 0)
2135 break;
2136
2137 /* write all collected blocks at once */
2138 if((nwritten = writev(fd, iov, ion)) < 0) {
2139 if (errno != EAGAIN) {
2140 redisLog(REDIS_VERBOSE,
2141 "Error writing to client: %s", strerror(errno));
2142 freeClient(c);
2143 return;
2144 }
2145 break;
2146 }
2147
2148 totwritten += nwritten;
2149 offset = c->sentlen;
2150
2151 /* remove written robjs from c->reply */
2152 while (nwritten && listLength(c->reply)) {
2153 o = listNodeValue(listFirst(c->reply));
2154 objlen = sdslen(o->ptr);
2155
2156 if(nwritten >= objlen - offset) {
2157 listDelNode(c->reply, listFirst(c->reply));
2158 nwritten -= objlen - offset;
2159 c->sentlen = 0;
2160 } else {
2161 /* partial write */
2162 c->sentlen += nwritten;
2163 break;
2164 }
2165 offset = 0;
2166 }
2167 }
2168
2169 if (totwritten > 0)
2170 c->lastinteraction = time(NULL);
2171
2172 if (listLength(c->reply) == 0) {
2173 c->sentlen = 0;
2174 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2175 }
2176 }
2177
2178 static struct redisCommand *lookupCommand(char *name) {
2179 int j = 0;
2180 while(cmdTable[j].name != NULL) {
2181 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2182 j++;
2183 }
2184 return NULL;
2185 }
2186
2187 /* resetClient prepare the client to process the next command */
2188 static void resetClient(redisClient *c) {
2189 freeClientArgv(c);
2190 c->bulklen = -1;
2191 c->multibulk = 0;
2192 }
2193
2194 /* Call() is the core of Redis execution of a command */
2195 static void call(redisClient *c, struct redisCommand *cmd) {
2196 long long dirty;
2197
2198 dirty = server.dirty;
2199 cmd->proc(c);
2200 dirty = server.dirty-dirty;
2201
2202 if (server.appendonly && dirty)
2203 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2204 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2205 listLength(server.slaves))
2206 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2207 if (listLength(server.monitors))
2208 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2209 server.stat_numcommands++;
2210 }
2211
2212 /* If this function gets called we already read a whole
2213 * command, argments are in the client argv/argc fields.
2214 * processCommand() execute the command or prepare the
2215 * server for a bulk read from the client.
2216 *
2217 * If 1 is returned the client is still alive and valid and
2218 * and other operations can be performed by the caller. Otherwise
2219 * if 0 is returned the client was destroied (i.e. after QUIT). */
2220 static int processCommand(redisClient *c) {
2221 struct redisCommand *cmd;
2222
2223 /* Free some memory if needed (maxmemory setting) */
2224 if (server.maxmemory) freeMemoryIfNeeded();
2225
2226 /* Handle the multi bulk command type. This is an alternative protocol
2227 * supported by Redis in order to receive commands that are composed of
2228 * multiple binary-safe "bulk" arguments. The latency of processing is
2229 * a bit higher but this allows things like multi-sets, so if this
2230 * protocol is used only for MSET and similar commands this is a big win. */
2231 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2232 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2233 if (c->multibulk <= 0) {
2234 resetClient(c);
2235 return 1;
2236 } else {
2237 decrRefCount(c->argv[c->argc-1]);
2238 c->argc--;
2239 return 1;
2240 }
2241 } else if (c->multibulk) {
2242 if (c->bulklen == -1) {
2243 if (((char*)c->argv[0]->ptr)[0] != '$') {
2244 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2245 resetClient(c);
2246 return 1;
2247 } else {
2248 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2249 decrRefCount(c->argv[0]);
2250 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2251 c->argc--;
2252 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2253 resetClient(c);
2254 return 1;
2255 }
2256 c->argc--;
2257 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2258 return 1;
2259 }
2260 } else {
2261 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2262 c->mbargv[c->mbargc] = c->argv[0];
2263 c->mbargc++;
2264 c->argc--;
2265 c->multibulk--;
2266 if (c->multibulk == 0) {
2267 robj **auxargv;
2268 int auxargc;
2269
2270 /* Here we need to swap the multi-bulk argc/argv with the
2271 * normal argc/argv of the client structure. */
2272 auxargv = c->argv;
2273 c->argv = c->mbargv;
2274 c->mbargv = auxargv;
2275
2276 auxargc = c->argc;
2277 c->argc = c->mbargc;
2278 c->mbargc = auxargc;
2279
2280 /* We need to set bulklen to something different than -1
2281 * in order for the code below to process the command without
2282 * to try to read the last argument of a bulk command as
2283 * a special argument. */
2284 c->bulklen = 0;
2285 /* continue below and process the command */
2286 } else {
2287 c->bulklen = -1;
2288 return 1;
2289 }
2290 }
2291 }
2292 /* -- end of multi bulk commands processing -- */
2293
2294 /* The QUIT command is handled as a special case. Normal command
2295 * procs are unable to close the client connection safely */
2296 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2297 freeClient(c);
2298 return 0;
2299 }
2300
2301 /* Now lookup the command and check ASAP about trivial error conditions
2302 * such wrong arity, bad command name and so forth. */
2303 cmd = lookupCommand(c->argv[0]->ptr);
2304 if (!cmd) {
2305 addReplySds(c,
2306 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2307 (char*)c->argv[0]->ptr));
2308 resetClient(c);
2309 return 1;
2310 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2311 (c->argc < -cmd->arity)) {
2312 addReplySds(c,
2313 sdscatprintf(sdsempty(),
2314 "-ERR wrong number of arguments for '%s' command\r\n",
2315 cmd->name));
2316 resetClient(c);
2317 return 1;
2318 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2319 /* This is a bulk command, we have to read the last argument yet. */
2320 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2321
2322 decrRefCount(c->argv[c->argc-1]);
2323 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2324 c->argc--;
2325 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2326 resetClient(c);
2327 return 1;
2328 }
2329 c->argc--;
2330 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2331 /* It is possible that the bulk read is already in the
2332 * buffer. Check this condition and handle it accordingly.
2333 * This is just a fast path, alternative to call processInputBuffer().
2334 * It's a good idea since the code is small and this condition
2335 * happens most of the times. */
2336 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2337 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2338 c->argc++;
2339 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2340 } else {
2341 /* Otherwise return... there is to read the last argument
2342 * from the socket. */
2343 return 1;
2344 }
2345 }
2346 /* Let's try to encode the bulk object to save space. */
2347 if (cmd->flags & REDIS_CMD_BULK)
2348 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2349
2350 /* Check if the user is authenticated */
2351 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2352 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2353 resetClient(c);
2354 return 1;
2355 }
2356
2357 /* Handle the maxmemory directive */
2358 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2359 zmalloc_used_memory() > server.maxmemory)
2360 {
2361 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2362 resetClient(c);
2363 return 1;
2364 }
2365
2366 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2367 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2368 &&
2369 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2370 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2371 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2372 resetClient(c);
2373 return 1;
2374 }
2375
2376 /* Exec the command */
2377 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2378 queueMultiCommand(c,cmd);
2379 addReply(c,shared.queued);
2380 } else {
2381 if (server.vm_enabled && server.vm_max_threads > 0 &&
2382 blockClientOnSwappedKeys(cmd,c)) return 1;
2383 call(c,cmd);
2384 }
2385
2386 /* Prepare the client for the next command */
2387 resetClient(c);
2388 return 1;
2389 }
2390
2391 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2392 listNode *ln;
2393 listIter li;
2394 int outc = 0, j;
2395 robj **outv;
2396 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2397 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2398 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2399 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2400 robj *lenobj;
2401
2402 if (argc <= REDIS_STATIC_ARGS) {
2403 outv = static_outv;
2404 } else {
2405 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2406 }
2407
2408 lenobj = createObject(REDIS_STRING,
2409 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2410 lenobj->refcount = 0;
2411 outv[outc++] = lenobj;
2412 for (j = 0; j < argc; j++) {
2413 lenobj = createObject(REDIS_STRING,
2414 sdscatprintf(sdsempty(),"$%lu\r\n",
2415 (unsigned long) stringObjectLen(argv[j])));
2416 lenobj->refcount = 0;
2417 outv[outc++] = lenobj;
2418 outv[outc++] = argv[j];
2419 outv[outc++] = shared.crlf;
2420 }
2421
2422 /* Increment all the refcounts at start and decrement at end in order to
2423 * be sure to free objects if there is no slave in a replication state
2424 * able to be feed with commands */
2425 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2426 listRewind(slaves,&li);
2427 while((ln = listNext(&li))) {
2428 redisClient *slave = ln->value;
2429
2430 /* Don't feed slaves that are still waiting for BGSAVE to start */
2431 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2432
2433 /* Feed all the other slaves, MONITORs and so on */
2434 if (slave->slaveseldb != dictid) {
2435 robj *selectcmd;
2436
2437 switch(dictid) {
2438 case 0: selectcmd = shared.select0; break;
2439 case 1: selectcmd = shared.select1; break;
2440 case 2: selectcmd = shared.select2; break;
2441 case 3: selectcmd = shared.select3; break;
2442 case 4: selectcmd = shared.select4; break;
2443 case 5: selectcmd = shared.select5; break;
2444 case 6: selectcmd = shared.select6; break;
2445 case 7: selectcmd = shared.select7; break;
2446 case 8: selectcmd = shared.select8; break;
2447 case 9: selectcmd = shared.select9; break;
2448 default:
2449 selectcmd = createObject(REDIS_STRING,
2450 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2451 selectcmd->refcount = 0;
2452 break;
2453 }
2454 addReply(slave,selectcmd);
2455 slave->slaveseldb = dictid;
2456 }
2457 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2458 }
2459 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2460 if (outv != static_outv) zfree(outv);
2461 }
2462
2463 static void processInputBuffer(redisClient *c) {
2464 again:
2465 /* Before to process the input buffer, make sure the client is not
2466 * waitig for a blocking operation such as BLPOP. Note that the first
2467 * iteration the client is never blocked, otherwise the processInputBuffer
2468 * would not be called at all, but after the execution of the first commands
2469 * in the input buffer the client may be blocked, and the "goto again"
2470 * will try to reiterate. The following line will make it return asap. */
2471 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2472 if (c->bulklen == -1) {
2473 /* Read the first line of the query */
2474 char *p = strchr(c->querybuf,'\n');
2475 size_t querylen;
2476
2477 if (p) {
2478 sds query, *argv;
2479 int argc, j;
2480
2481 query = c->querybuf;
2482 c->querybuf = sdsempty();
2483 querylen = 1+(p-(query));
2484 if (sdslen(query) > querylen) {
2485 /* leave data after the first line of the query in the buffer */
2486 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2487 }
2488 *p = '\0'; /* remove "\n" */
2489 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2490 sdsupdatelen(query);
2491
2492 /* Now we can split the query in arguments */
2493 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2494 sdsfree(query);
2495
2496 if (c->argv) zfree(c->argv);
2497 c->argv = zmalloc(sizeof(robj*)*argc);
2498
2499 for (j = 0; j < argc; j++) {
2500 if (sdslen(argv[j])) {
2501 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2502 c->argc++;
2503 } else {
2504 sdsfree(argv[j]);
2505 }
2506 }
2507 zfree(argv);
2508 if (c->argc) {
2509 /* Execute the command. If the client is still valid
2510 * after processCommand() return and there is something
2511 * on the query buffer try to process the next command. */
2512 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2513 } else {
2514 /* Nothing to process, argc == 0. Just process the query
2515 * buffer if it's not empty or return to the caller */
2516 if (sdslen(c->querybuf)) goto again;
2517 }
2518 return;
2519 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2520 redisLog(REDIS_VERBOSE, "Client protocol error");
2521 freeClient(c);
2522 return;
2523 }
2524 } else {
2525 /* Bulk read handling. Note that if we are at this point
2526 the client already sent a command terminated with a newline,
2527 we are reading the bulk data that is actually the last
2528 argument of the command. */
2529 int qbl = sdslen(c->querybuf);
2530
2531 if (c->bulklen <= qbl) {
2532 /* Copy everything but the final CRLF as final argument */
2533 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2534 c->argc++;
2535 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2536 /* Process the command. If the client is still valid after
2537 * the processing and there is more data in the buffer
2538 * try to parse it. */
2539 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2540 return;
2541 }
2542 }
2543 }
2544
2545 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2546 redisClient *c = (redisClient*) privdata;
2547 char buf[REDIS_IOBUF_LEN];
2548 int nread;
2549 REDIS_NOTUSED(el);
2550 REDIS_NOTUSED(mask);
2551
2552 nread = read(fd, buf, REDIS_IOBUF_LEN);
2553 if (nread == -1) {
2554 if (errno == EAGAIN) {
2555 nread = 0;
2556 } else {
2557 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2558 freeClient(c);
2559 return;
2560 }
2561 } else if (nread == 0) {
2562 redisLog(REDIS_VERBOSE, "Client closed connection");
2563 freeClient(c);
2564 return;
2565 }
2566 if (nread) {
2567 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2568 c->lastinteraction = time(NULL);
2569 } else {
2570 return;
2571 }
2572 processInputBuffer(c);
2573 }
2574
2575 static int selectDb(redisClient *c, int id) {
2576 if (id < 0 || id >= server.dbnum)
2577 return REDIS_ERR;
2578 c->db = &server.db[id];
2579 return REDIS_OK;
2580 }
2581
2582 static void *dupClientReplyValue(void *o) {
2583 incrRefCount((robj*)o);
2584 return o;
2585 }
2586
/* List "match" method: returns 1 when compareStringObjects() reports the
 * two objects as equal, 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    if (compareStringObjects(a,b) != 0) return 0;
    return 1;
}
2590
2591 static redisClient *createClient(int fd) {
2592 redisClient *c = zmalloc(sizeof(*c));
2593
2594 anetNonBlock(NULL,fd);
2595 anetTcpNoDelay(NULL,fd);
2596 if (!c) return NULL;
2597 selectDb(c,0);
2598 c->fd = fd;
2599 c->querybuf = sdsempty();
2600 c->argc = 0;
2601 c->argv = NULL;
2602 c->bulklen = -1;
2603 c->multibulk = 0;
2604 c->mbargc = 0;
2605 c->mbargv = NULL;
2606 c->sentlen = 0;
2607 c->flags = 0;
2608 c->lastinteraction = time(NULL);
2609 c->authenticated = 0;
2610 c->replstate = REDIS_REPL_NONE;
2611 c->reply = listCreate();
2612 listSetFreeMethod(c->reply,decrRefCount);
2613 listSetDupMethod(c->reply,dupClientReplyValue);
2614 c->blockingkeys = NULL;
2615 c->blockingkeysnum = 0;
2616 c->io_keys = listCreate();
2617 listSetFreeMethod(c->io_keys,decrRefCount);
2618 c->pubsub_channels = dictCreate(&setDictType,NULL);
2619 c->pubsub_patterns = listCreate();
2620 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2621 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2622 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2623 readQueryFromClient, c) == AE_ERR) {
2624 freeClient(c);
2625 return NULL;
2626 }
2627 listAddNodeTail(server.clients,c);
2628 initClientMultiState(c);
2629 return c;
2630 }
2631
2632 static void addReply(redisClient *c, robj *obj) {
2633 if (listLength(c->reply) == 0 &&
2634 (c->replstate == REDIS_REPL_NONE ||
2635 c->replstate == REDIS_REPL_ONLINE) &&
2636 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2637 sendReplyToClient, c) == AE_ERR) return;
2638
2639 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2640 obj = dupStringObject(obj);
2641 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2642 }
2643 listAddNodeTail(c->reply,getDecodedObject(obj));
2644 }
2645
2646 static void addReplySds(redisClient *c, sds s) {
2647 robj *o = createObject(REDIS_STRING,s);
2648 addReply(c,o);
2649 decrRefCount(o);
2650 }
2651
2652 static void addReplyDouble(redisClient *c, double d) {
2653 char buf[128];
2654
2655 snprintf(buf,sizeof(buf),"%.17g",d);
2656 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2657 (unsigned long) strlen(buf),buf));
2658 }
2659
2660 static void addReplyLong(redisClient *c, long l) {
2661 char buf[128];
2662 size_t len;
2663
2664 if (l == 0) {
2665 addReply(c,shared.czero);
2666 return;
2667 } else if (l == 1) {
2668 addReply(c,shared.cone);
2669 return;
2670 }
2671 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2672 addReplySds(c,sdsnewlen(buf,len));
2673 }
2674
2675 static void addReplyLongLong(redisClient *c, long long ll) {
2676 char buf[128];
2677 size_t len;
2678
2679 if (ll == 0) {
2680 addReply(c,shared.czero);
2681 return;
2682 } else if (ll == 1) {
2683 addReply(c,shared.cone);
2684 return;
2685 }
2686 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2687 addReplySds(c,sdsnewlen(buf,len));
2688 }
2689
2690 static void addReplyUlong(redisClient *c, unsigned long ul) {
2691 char buf[128];
2692 size_t len;
2693
2694 if (ul == 0) {
2695 addReply(c,shared.czero);
2696 return;
2697 } else if (ul == 1) {
2698 addReply(c,shared.cone);
2699 return;
2700 }
2701 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2702 addReplySds(c,sdsnewlen(buf,len));
2703 }
2704
2705 static void addReplyBulkLen(redisClient *c, robj *obj) {
2706 size_t len;
2707
2708 if (obj->encoding == REDIS_ENCODING_RAW) {
2709 len = sdslen(obj->ptr);
2710 } else {
2711 long n = (long)obj->ptr;
2712
2713 /* Compute how many bytes will take this integer as a radix 10 string */
2714 len = 1;
2715 if (n < 0) {
2716 len++;
2717 n = -n;
2718 }
2719 while((n = n/10) != 0) {
2720 len++;
2721 }
2722 }
2723 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2724 }
2725
2726 static void addReplyBulk(redisClient *c, robj *obj) {
2727 addReplyBulkLen(c,obj);
2728 addReply(c,obj);
2729 addReply(c,shared.crlf);
2730 }
2731
2732 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2733 static void addReplyBulkCString(redisClient *c, char *s) {
2734 if (s == NULL) {
2735 addReply(c,shared.nullbulk);
2736 } else {
2737 robj *o = createStringObject(s,strlen(s));
2738 addReplyBulk(c,o);
2739 decrRefCount(o);
2740 }
2741 }
2742
2743 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2744 int cport, cfd;
2745 char cip[128];
2746 redisClient *c;
2747 REDIS_NOTUSED(el);
2748 REDIS_NOTUSED(mask);
2749 REDIS_NOTUSED(privdata);
2750
2751 cfd = anetAccept(server.neterr, fd, cip, &cport);
2752 if (cfd == AE_ERR) {
2753 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2754 return;
2755 }
2756 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2757 if ((c = createClient(cfd)) == NULL) {
2758 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2759 close(cfd); /* May be already closed, just ingore errors */
2760 return;
2761 }
2762 /* If maxclient directive is set and this is one client more... close the
2763 * connection. Note that we create the client instead to check before
2764 * for this condition, since now the socket is already set in nonblocking
2765 * mode and we can send an error for free using the Kernel I/O */
2766 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2767 char *err = "-ERR max number of clients reached\r\n";
2768
2769 /* That's a best effort error message, don't check write errors */
2770 if (write(c->fd,err,strlen(err)) == -1) {
2771 /* Nothing to do, Just to avoid the warning... */
2772 }
2773 freeClient(c);
2774 return;
2775 }
2776 server.stat_numconnections++;
2777 }
2778
2779 /* ======================= Redis objects implementation ===================== */
2780
2781 static robj *createObject(int type, void *ptr) {
2782 robj *o;
2783
2784 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2785 if (listLength(server.objfreelist)) {
2786 listNode *head = listFirst(server.objfreelist);
2787 o = listNodeValue(head);
2788 listDelNode(server.objfreelist,head);
2789 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2790 } else {
2791 if (server.vm_enabled) {
2792 pthread_mutex_unlock(&server.obj_freelist_mutex);
2793 o = zmalloc(sizeof(*o));
2794 } else {
2795 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2796 }
2797 }
2798 o->type = type;
2799 o->encoding = REDIS_ENCODING_RAW;
2800 o->ptr = ptr;
2801 o->refcount = 1;
2802 if (server.vm_enabled) {
2803 /* Note that this code may run in the context of an I/O thread
2804 * and accessing to server.unixtime in theory is an error
2805 * (no locks). But in practice this is safe, and even if we read
2806 * garbage Redis will not fail, as it's just a statistical info */
2807 o->vm.atime = server.unixtime;
2808 o->storage = REDIS_VM_MEMORY;
2809 }
2810 return o;
2811 }
2812
2813 static robj *createStringObject(char *ptr, size_t len) {
2814 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2815 }
2816
2817 static robj *createStringObjectFromLongLong(long long value) {
2818 robj *o;
2819 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2820 incrRefCount(shared.integers[value]);
2821 o = shared.integers[value];
2822 } else {
2823 o = createObject(REDIS_STRING, NULL);
2824 if (value >= LONG_MIN && value <= LONG_MAX) {
2825 o->encoding = REDIS_ENCODING_INT;
2826 o->ptr = (void*)((long)value);
2827 } else {
2828 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2829 }
2830 }
2831 return o;
2832 }
2833
2834 static robj *dupStringObject(robj *o) {
2835 assert(o->encoding == REDIS_ENCODING_RAW);
2836 return createStringObject(o->ptr,sdslen(o->ptr));
2837 }
2838
2839 static robj *createListObject(void) {
2840 list *l = listCreate();
2841
2842 listSetFreeMethod(l,decrRefCount);
2843 return createObject(REDIS_LIST,l);
2844 }
2845
2846 static robj *createSetObject(void) {
2847 dict *d = dictCreate(&setDictType,NULL);
2848 return createObject(REDIS_SET,d);
2849 }
2850
2851 static robj *createHashObject(void) {
2852 /* All the Hashes start as zipmaps. Will be automatically converted
2853 * into hash tables if there are enough elements or big elements
2854 * inside. */
2855 unsigned char *zm = zipmapNew();
2856 robj *o = createObject(REDIS_HASH,zm);
2857 o->encoding = REDIS_ENCODING_ZIPMAP;
2858 return o;
2859 }
2860
2861 static robj *createZsetObject(void) {
2862 zset *zs = zmalloc(sizeof(*zs));
2863
2864 zs->dict = dictCreate(&zsetDictType,NULL);
2865 zs->zsl = zslCreate();
2866 return createObject(REDIS_ZSET,zs);
2867 }
2868
2869 static void freeStringObject(robj *o) {
2870 if (o->encoding == REDIS_ENCODING_RAW) {
2871 sdsfree(o->ptr);
2872 }
2873 }
2874
2875 static void freeListObject(robj *o) {
2876 listRelease((list*) o->ptr);
2877 }
2878
2879 static void freeSetObject(robj *o) {
2880 dictRelease((dict*) o->ptr);
2881 }
2882
2883 static void freeZsetObject(robj *o) {
2884 zset *zs = o->ptr;
2885
2886 dictRelease(zs->dict);
2887 zslFree(zs->zsl);
2888 zfree(zs);
2889 }
2890
2891 static void freeHashObject(robj *o) {
2892 switch (o->encoding) {
2893 case REDIS_ENCODING_HT:
2894 dictRelease((dict*) o->ptr);
2895 break;
2896 case REDIS_ENCODING_ZIPMAP:
2897 zfree(o->ptr);
2898 break;
2899 default:
2900 redisPanic("Unknown hash encoding type");
2901 break;
2902 }
2903 }
2904
2905 static void incrRefCount(robj *o) {
2906 o->refcount++;
2907 }
2908
2909 static void decrRefCount(void *obj) {
2910 robj *o = obj;
2911
2912 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2913 /* Object is a key of a swapped out value, or in the process of being
2914 * loaded. */
2915 if (server.vm_enabled &&
2916 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2917 {
2918 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2919 redisAssert(o->type == REDIS_STRING);
2920 freeStringObject(o);
2921 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2922 pthread_mutex_lock(&server.obj_freelist_mutex);
2923 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2924 !listAddNodeHead(server.objfreelist,o))
2925 zfree(o);
2926 pthread_mutex_unlock(&server.obj_freelist_mutex);
2927 server.vm_stats_swapped_objects--;
2928 return;
2929 }
2930 /* Object is in memory, or in the process of being swapped out. */
2931 if (--(o->refcount) == 0) {
2932 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2933 vmCancelThreadedIOJob(obj);
2934 switch(o->type) {
2935 case REDIS_STRING: freeStringObject(o); break;
2936 case REDIS_LIST: freeListObject(o); break;
2937 case REDIS_SET: freeSetObject(o); break;
2938 case REDIS_ZSET: freeZsetObject(o); break;
2939 case REDIS_HASH: freeHashObject(o); break;
2940 default: redisPanic("Unknown object type"); break;
2941 }
2942 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2943 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2944 !listAddNodeHead(server.objfreelist,o))
2945 zfree(o);
2946 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2947 }
2948 }
2949
2950 static robj *lookupKey(redisDb *db, robj *key) {
2951 dictEntry *de = dictFind(db->dict,key);
2952 if (de) {
2953 robj *key = dictGetEntryKey(de);
2954 robj *val = dictGetEntryVal(de);
2955
2956 if (server.vm_enabled) {
2957 if (key->storage == REDIS_VM_MEMORY ||
2958 key->storage == REDIS_VM_SWAPPING)
2959 {
2960 /* If we were swapping the object out, stop it, this key
2961 * was requested. */
2962 if (key->storage == REDIS_VM_SWAPPING)
2963 vmCancelThreadedIOJob(key);
2964 /* Update the access time of the key for the aging algorithm. */
2965 key->vm.atime = server.unixtime;
2966 } else {
2967 int notify = (key->storage == REDIS_VM_LOADING);
2968
2969 /* Our value was swapped on disk. Bring it at home. */
2970 redisAssert(val == NULL);
2971 val = vmLoadObject(key);
2972 dictGetEntryVal(de) = val;
2973
2974 /* Clients blocked by the VM subsystem may be waiting for
2975 * this key... */
2976 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2977 }
2978 }
2979 return val;
2980 } else {
2981 return NULL;
2982 }
2983 }
2984
2985 static robj *lookupKeyRead(redisDb *db, robj *key) {
2986 expireIfNeeded(db,key);
2987 return lookupKey(db,key);
2988 }
2989
2990 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2991 deleteIfVolatile(db,key);
2992 return lookupKey(db,key);
2993 }
2994
2995 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2996 robj *o = lookupKeyRead(c->db, key);
2997 if (!o) addReply(c,reply);
2998 return o;
2999 }
3000
3001 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3002 robj *o = lookupKeyWrite(c->db, key);
3003 if (!o) addReply(c,reply);
3004 return o;
3005 }
3006
3007 static int checkType(redisClient *c, robj *o, int type) {
3008 if (o->type != type) {
3009 addReply(c,shared.wrongtypeerr);
3010 return 1;
3011 }
3012 return 0;
3013 }
3014
3015 static int deleteKey(redisDb *db, robj *key) {
3016 int retval;
3017
3018 /* We need to protect key from destruction: after the first dictDelete()
3019 * it may happen that 'key' is no longer valid if we don't increment
3020 * it's count. This may happen when we get the object reference directly
3021 * from the hash table with dictRandomKey() or dict iterators */
3022 incrRefCount(key);
3023 if (dictSize(db->expires)) dictDelete(db->expires,key);
3024 retval = dictDelete(db->dict,key);
3025 decrRefCount(key);
3026
3027 return retval == DICT_OK;
3028 }
3029
3030 /* Check if the nul-terminated string 's' can be represented by a long
3031 * (that is, is a number that fits into long without any other space or
3032 * character before or after the digits).
3033 *
3034 * If so, the function returns REDIS_OK and *longval is set to the value
3035 * of the number. Otherwise REDIS_ERR is returned */
3036 static int isStringRepresentableAsLong(sds s, long *longval) {
3037 char buf[32], *endptr;
3038 long value;
3039 int slen;
3040
3041 value = strtol(s, &endptr, 10);
3042 if (endptr[0] != '\0') return REDIS_ERR;
3043 slen = snprintf(buf,32,"%ld",value);
3044
3045 /* If the number converted back into a string is not identical
3046 * then it's not possible to encode the string as integer */
3047 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3048 if (longval) *longval = value;
3049 return REDIS_OK;
3050 }
3051
3052 /* Try to encode a string object in order to save space */
3053 static robj *tryObjectEncoding(robj *o) {
3054 long value;
3055 sds s = o->ptr;
3056
3057 if (o->encoding != REDIS_ENCODING_RAW)
3058 return o; /* Already encoded */
3059
3060 /* It's not safe to encode shared objects: shared objects can be shared
3061 * everywhere in the "object space" of Redis. Encoded objects can only
3062 * appear as "values" (and not, for instance, as keys) */
3063 if (o->refcount > 1) return o;
3064
3065 /* Currently we try to encode only strings */
3066 redisAssert(o->type == REDIS_STRING);
3067
3068 /* Check if we can represent this string as a long integer */
3069 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3070
3071 /* Ok, this object can be encoded */
3072 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3073 decrRefCount(o);
3074 incrRefCount(shared.integers[value]);
3075 return shared.integers[value];
3076 } else {
3077 o->encoding = REDIS_ENCODING_INT;
3078 sdsfree(o->ptr);
3079 o->ptr = (void*) value;
3080 return o;
3081 }
3082 }
3083
3084 /* Get a decoded version of an encoded object (returned as a new object).
3085 * If the object is already raw-encoded just increment the ref count. */
3086 static robj *getDecodedObject(robj *o) {
3087 robj *dec;
3088
3089 if (o->encoding == REDIS_ENCODING_RAW) {
3090 incrRefCount(o);
3091 return o;
3092 }
3093 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3094 char buf[32];
3095
3096 snprintf(buf,32,"%ld",(long)o->ptr);
3097 dec = createStringObject(buf,strlen(buf));
3098 return dec;
3099 } else {
3100 redisPanic("Unknown encoding type");
3101 }
3102 }
3103
3104 /* Compare two string objects via strcmp() or alike.
3105 * Note that the objects may be integer-encoded. In such a case we
3106 * use snprintf() to get a string representation of the numbers on the stack
3107 * and compare the strings, it's much faster than calling getDecodedObject().
3108 *
3109 * Important note: if objects are not integer encoded, but binary-safe strings,
3110 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3111 * binary safe. */
3112 static int compareStringObjects(robj *a, robj *b) {
3113 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3114 char bufa[128], bufb[128], *astr, *bstr;
3115 int bothsds = 1;
3116
3117 if (a == b) return 0;
3118 if (a->encoding != REDIS_ENCODING_RAW) {
3119 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3120 astr = bufa;
3121 bothsds = 0;
3122 } else {
3123 astr = a->ptr;
3124 }
3125 if (b->encoding != REDIS_ENCODING_RAW) {
3126 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3127 bstr = bufb;
3128 bothsds = 0;
3129 } else {
3130 bstr = b->ptr;
3131 }
3132 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3133 }
3134
3135 static size_t stringObjectLen(robj *o) {
3136 redisAssert(o->type == REDIS_STRING);
3137 if (o->encoding == REDIS_ENCODING_RAW) {
3138 return sdslen(o->ptr);
3139 } else {
3140 char buf[32];
3141
3142 return snprintf(buf,32,"%ld",(long)o->ptr);
3143 }
3144 }
3145
3146 static int getDoubleFromObject(robj *o, double *target) {
3147 double value;
3148 char *eptr;
3149
3150 if (o == NULL) {
3151 value = 0;
3152 } else {
3153 redisAssert(o->type == REDIS_STRING);
3154 if (o->encoding == REDIS_ENCODING_RAW) {
3155 value = strtod(o->ptr, &eptr);
3156 if (eptr[0] != '\0') return REDIS_ERR;
3157 } else if (o->encoding == REDIS_ENCODING_INT) {
3158 value = (long)o->ptr;
3159 } else {
3160 redisAssert(1 != 1);
3161 }
3162 }
3163
3164 *target = value;
3165 return REDIS_OK;
3166 }
3167
3168 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3169 double value;
3170 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3171 if (msg != NULL) {
3172 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3173 } else {
3174 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3175 }
3176 return REDIS_ERR;
3177 }
3178
3179 *target = value;
3180 return REDIS_OK;
3181 }
3182
3183 static int getLongLongFromObject(robj *o, long long *target) {
3184 long long value;
3185 char *eptr;
3186
3187 if (o == NULL) {
3188 value = 0;
3189 } else {
3190 redisAssert(o->type == REDIS_STRING);
3191 if (o->encoding == REDIS_ENCODING_RAW) {
3192 value = strtoll(o->ptr, &eptr, 10);
3193 if (eptr[0] != '\0') return REDIS_ERR;
3194 } else if (o->encoding == REDIS_ENCODING_INT) {
3195 value = (long)o->ptr;
3196 } else {
3197 redisAssert(1 != 1);
3198 }
3199 }
3200
3201 *target = value;
3202 return REDIS_OK;
3203 }
3204
3205 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3206 long long value;
3207 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3208 if (msg != NULL) {
3209 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3210 } else {
3211 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3212 }
3213 return REDIS_ERR;
3214 }
3215
3216 *target = value;
3217 return REDIS_OK;
3218 }
3219
3220 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3221 long long value;
3222
3223 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3224 if (value < LONG_MIN || value > LONG_MAX) {
3225 if (msg != NULL) {
3226 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3227 } else {
3228 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3229 }
3230 return REDIS_ERR;
3231 }
3232
3233 *target = value;
3234 return REDIS_OK;
3235 }
3236
3237 /*============================ RDB saving/loading =========================== */
3238
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 when
 * fwrite() fails. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3243
/* Write the time 't' to 'fp' as a 32 bit signed integer (4 bytes, in the
 * byte order of the host). Returns 0 on success, -1 when fwrite()
 * fails. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3249
3250 /* check rdbLoadLen() comments for more info */
3251 static int rdbSaveLen(FILE *fp, uint32_t len) {
3252 unsigned char buf[2];
3253
3254 if (len < (1<<6)) {
3255 /* Save a 6 bit len */
3256 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3257 if (fwrite(buf,1,1,fp) == 0) return -1;
3258 } else if (len < (1<<14)) {
3259 /* Save a 14 bit len */
3260 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3261 buf[1] = len&0xFF;
3262 if (fwrite(buf,2,1,fp) == 0) return -1;
3263 } else {
3264 /* Save a 32 bit len */
3265 buf[0] = (REDIS_RDB_32BITLEN<<6);
3266 if (fwrite(buf,1,1,fp) == 0) return -1;
3267 len = htonl(len);
3268 if (fwrite(&len,4,1,fp) == 0) return -1;
3269 }
3270 return 0;
3271 }
3272
3273 /* String objects in the form "2391" "-100" without any space and with a
3274 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3275 * encoded as integers to save space */
3276 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3277 long long value;
3278 char *endptr, buf[32];
3279
3280 /* Check if it's possible to encode this value as a number */
3281 value = strtoll(s, &endptr, 10);
3282 if (endptr[0] != '\0') return 0;
3283 snprintf(buf,32,"%lld",value);
3284
3285 /* If the number converted back into a string is not identical
3286 * then it's not possible to encode the string as integer */
3287 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3288
3289 /* Finally check if it fits in our ranges */
3290 if (value >= -(1<<7) && value <= (1<<7)-1) {
3291 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3292 enc[1] = value&0xFF;
3293 return 2;
3294 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3295 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3296 enc[1] = value&0xFF;
3297 enc[2] = (value>>8)&0xFF;
3298 return 3;
3299 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3300 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3301 enc[1] = value&0xFF;
3302 enc[2] = (value>>8)&0xFF;
3303 enc[3] = (value>>16)&0xFF;
3304 enc[4] = (value>>24)&0xFF;
3305 return 5;
3306 } else {
3307 return 0;
3308 }
3309 }
3310
3311 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3312 size_t comprlen, outlen;
3313 unsigned char byte;
3314 void *out;
3315
3316 /* We require at least four bytes compression for this to be worth it */
3317 if (len <= 4) return 0;
3318 outlen = len-4;
3319 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3320 comprlen = lzf_compress(s, len, out, outlen);
3321 if (comprlen == 0) {
3322 zfree(out);
3323 return 0;
3324 }
3325 /* Data compressed! Let's save it on disk */
3326 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3327 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3328 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3329 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3330 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3331 zfree(out);
3332 return comprlen;
3333
3334 writeerr:
3335 zfree(out);
3336 return -1;
3337 }
3338
3339 /* Save a string objet as [len][data] on disk. If the object is a string
3340 * representation of an integer value we try to safe it in a special form */
3341 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3342 int enclen;
3343
3344 /* Try integer encoding */
3345 if (len <= 11) {
3346 unsigned char buf[5];
3347 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3348 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3349 return 0;
3350 }
3351 }
3352
3353 /* Try LZF compression - under 20 bytes it's unable to compress even
3354 * aaaaaaaaaaaaaaaaaa so skip it */
3355 if (server.rdbcompression && len > 20) {
3356 int retval;
3357
3358 retval = rdbSaveLzfStringObject(fp,s,len);
3359 if (retval == -1) return -1;
3360 if (retval > 0) return 0;
3361 /* retval == 0 means data can't be compressed, save the old way */
3362 }
3363
3364 /* Store verbatim */
3365 if (rdbSaveLen(fp,len) == -1) return -1;
3366 if (len && fwrite(s,len,1,fp) == 0) return -1;
3367 return 0;
3368 }
3369
3370 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3371 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3372 int retval;
3373
3374 /* Avoid incr/decr ref count business when possible.
3375 * This plays well with copy-on-write given that we are probably
3376 * in a child process (BGSAVE). Also this makes sure key objects
3377 * of swapped objects are not incRefCount-ed (an assert does not allow
3378 * this in order to avoid bugs) */
3379 if (obj->encoding != REDIS_ENCODING_RAW) {
3380 obj = getDecodedObject(obj);
3381 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3382 decrRefCount(obj);
3383 } else {
3384 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3385 }
3386 return retval;
3387 }
3388
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. Three values of this prefix are special and are not
 * followed by any string:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char rec[128];
    int reclen = 1;

    if (isnan(val)) {
        rec[0] = 253;
    } else if (!isfinite(val)) {
        rec[0] = (val < 0) ? 255 : 254;
    } else {
        char *digits = (char*)rec+1;

        snprintf(digits,sizeof(rec)-1,"%.17g",val);
        rec[0] = strlen(digits);
        reclen += rec[0];
    }
    return (fwrite(rec,reclen,1,fp) == 0) ? -1 : 0;
}
3415
3416 /* Save a Redis object. */
3417 static int rdbSaveObject(FILE *fp, robj *o) {
3418 if (o->type == REDIS_STRING) {
3419 /* Save a string value */
3420 if (rdbSaveStringObject(fp,o) == -1) return -1;
3421 } else if (o->type == REDIS_LIST) {
3422 /* Save a list value */
3423 list *list = o->ptr;
3424 listIter li;
3425 listNode *ln;
3426
3427 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3428 listRewind(list,&li);
3429 while((ln = listNext(&li))) {
3430 robj *eleobj = listNodeValue(ln);
3431
3432 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3433 }
3434 } else if (o->type == REDIS_SET) {
3435 /* Save a set value */
3436 dict *set = o->ptr;
3437 dictIterator *di = dictGetIterator(set);
3438 dictEntry *de;
3439
3440 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3441 while((de = dictNext(di)) != NULL) {
3442 robj *eleobj = dictGetEntryKey(de);
3443
3444 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3445 }
3446 dictReleaseIterator(di);
3447 } else if (o->type == REDIS_ZSET) {
3448 /* Save a set value */
3449 zset *zs = o->ptr;
3450 dictIterator *di = dictGetIterator(zs->dict);
3451 dictEntry *de;
3452
3453 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3454 while((de = dictNext(di)) != NULL) {
3455 robj *eleobj = dictGetEntryKey(de);
3456 double *score = dictGetEntryVal(de);
3457
3458 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3459 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3460 }
3461 dictReleaseIterator(di);
3462 } else if (o->type == REDIS_HASH) {
3463 /* Save a hash value */
3464 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3465 unsigned char *p = zipmapRewind(o->ptr);
3466 unsigned int count = zipmapLen(o->ptr);
3467 unsigned char *key, *val;
3468 unsigned int klen, vlen;
3469
3470 if (rdbSaveLen(fp,count) == -1) return -1;
3471 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3472 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3473 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3474 }
3475 } else {
3476 dictIterator *di = dictGetIterator(o->ptr);
3477 dictEntry *de;
3478
3479 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3480 while((de = dictNext(di)) != NULL) {
3481 robj *key = dictGetEntryKey(de);
3482 robj *val = dictGetEntryVal(de);
3483
3484 if (rdbSaveStringObject(fp,key) == -1) return -1;
3485 if (rdbSaveStringObject(fp,val) == -1) return -1;
3486 }
3487 dictReleaseIterator(di);
3488 }
3489 } else {
3490 redisPanic("Unknown object type");
3491 }
3492 return 0;
3493 }
3494
3495 /* Return the length the object will have on disk if saved with
3496 * the rdbSaveObject() function. Currently we use a trick to get
3497 * this length with very little changes to the code. In the future
3498 * we could switch to a faster solution. */
3499 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3500 if (fp == NULL) fp = server.devnull;
3501 rewind(fp);
3502 assert(rdbSaveObject(fp,o) != 1);
3503 return ftello(fp);
3504 }
3505
3506 /* Return the number of pages required to save this object in the swap file */
3507 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3508 off_t bytes = rdbSavedObjectLen(o,fp);
3509
3510 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3511 }
3512
3513 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3514 static int rdbSave(char *filename) {
3515 dictIterator *di = NULL;
3516 dictEntry *de;
3517 FILE *fp;
3518 char tmpfile[256];
3519 int j;
3520 time_t now = time(NULL);
3521
3522 /* Wait for I/O therads to terminate, just in case this is a
3523 * foreground-saving, to avoid seeking the swap file descriptor at the
3524 * same time. */
3525 if (server.vm_enabled)
3526 waitEmptyIOJobsQueue();
3527
3528 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3529 fp = fopen(tmpfile,"w");
3530 if (!fp) {
3531 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3532 return REDIS_ERR;
3533 }
3534 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3535 for (j = 0; j < server.dbnum; j++) {
3536 redisDb *db = server.db+j;
3537 dict *d = db->dict;
3538 if (dictSize(d) == 0) continue;
3539 di = dictGetIterator(d);
3540 if (!di) {
3541 fclose(fp);
3542 return REDIS_ERR;
3543 }
3544
3545 /* Write the SELECT DB opcode */
3546 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3547 if (rdbSaveLen(fp,j) == -1) goto werr;
3548
3549 /* Iterate this DB writing every entry */
3550 while((de = dictNext(di)) != NULL) {
3551 robj *key = dictGetEntryKey(de);
3552 robj *o = dictGetEntryVal(de);
3553 time_t expiretime = getExpire(db,key);
3554
3555 /* Save the expire time */
3556 if (expiretime != -1) {
3557 /* If this key is already expired skip it */
3558 if (expiretime < now) continue;
3559 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3560 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3561 }
3562 /* Save the key and associated value. This requires special
3563 * handling if the value is swapped out. */
3564 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3565 key->storage == REDIS_VM_SWAPPING) {
3566 /* Save type, key, value */
3567 if (rdbSaveType(fp,o->type) == -1) goto werr;
3568 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3569 if (rdbSaveObject(fp,o) == -1) goto werr;
3570 } else {
3571 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3572 robj *po;
3573 /* Get a preview of the object in memory */
3574 po = vmPreviewObject(key);
3575 /* Save type, key, value */
3576 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3577 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3578 if (rdbSaveObject(fp,po) == -1) goto werr;
3579 /* Remove the loaded object from memory */
3580 decrRefCount(po);
3581 }
3582 }
3583 dictReleaseIterator(di);
3584 }
3585 /* EOF opcode */
3586 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3587
3588 /* Make sure data will not remain on the OS's output buffers */
3589 fflush(fp);
3590 fsync(fileno(fp));
3591 fclose(fp);
3592
3593 /* Use RENAME to make sure the DB file is changed atomically only
3594 * if the generate DB file is ok. */
3595 if (rename(tmpfile,filename) == -1) {
3596 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3597 unlink(tmpfile);
3598 return REDIS_ERR;
3599 }
3600 redisLog(REDIS_NOTICE,"DB saved on disk");
3601 server.dirty = 0;
3602 server.lastsave = time(NULL);
3603 return REDIS_OK;
3604
3605 werr:
3606 fclose(fp);
3607 unlink(tmpfile);
3608 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3609 if (di) dictReleaseIterator(di);
3610 return REDIS_ERR;
3611 }
3612
3613 static int rdbSaveBackground(char *filename) {
3614 pid_t childpid;
3615
3616 if (server.bgsavechildpid != -1) return REDIS_ERR;
3617 if (server.vm_enabled) waitEmptyIOJobsQueue();
3618 if ((childpid = fork()) == 0) {
3619 /* Child */
3620 if (server.vm_enabled) vmReopenSwapFile();
3621 close(server.fd);
3622 if (rdbSave(filename) == REDIS_OK) {
3623 _exit(0);
3624 } else {
3625 _exit(1);
3626 }
3627 } else {
3628 /* Parent */
3629 if (childpid == -1) {
3630 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3631 strerror(errno));
3632 return REDIS_ERR;
3633 }
3634 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3635 server.bgsavechildpid = childpid;
3636 updateDictResizePolicy();
3637 return REDIS_OK;
3638 }
3639 return REDIS_OK; /* unreached */
3640 }
3641
/* Unlink the temp dump file "temp-<childpid>.rdb" in the current directory.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3648
/* Read one byte from 'fp' and return it as a non-negative int, or -1 if the
 * byte could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 1) ? (int) byte : -1;
}
3654
/* Read a 4 byte int32_t (in the host byte order, as read by fread) from
 * 'fp' and return it as a time_t. Returns -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,4,1,fp) != 1) return -1;
    return (time_t) raw;
}
3660
3661 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3662 * of this file for a description of how this are stored on disk.
3663 *
3664 * isencoded is set to 1 if the readed length is not actually a length but
3665 * an "encoding type", check the above comments for more info */
3666 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3667 unsigned char buf[2];
3668 uint32_t len;
3669 int type;
3670
3671 if (isencoded) *isencoded = 0;
3672 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3673 type = (buf[0]&0xC0)>>6;
3674 if (type == REDIS_RDB_6BITLEN) {
3675 /* Read a 6 bit len */
3676 return buf[0]&0x3F;
3677 } else if (type == REDIS_RDB_ENCVAL) {
3678 /* Read a 6 bit len encoding type */
3679 if (isencoded) *isencoded = 1;
3680 return buf[0]&0x3F;
3681 } else if (type == REDIS_RDB_14BITLEN) {
3682 /* Read a 14 bit len */
3683 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3684 return ((buf[0]&0x3F)<<8)|buf[1];
3685 } else {
3686 /* Read a 32 bit len */
3687 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3688 return ntohl(len);
3689 }
3690 }
3691
3692 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3693 unsigned char enc[4];
3694 long long val;
3695
3696 if (enctype == REDIS_RDB_ENC_INT8) {
3697 if (fread(enc,1,1,fp) == 0) return NULL;
3698 val = (signed char)enc[0];
3699 } else if (enctype == REDIS_RDB_ENC_INT16) {
3700 uint16_t v;
3701 if (fread(enc,2,1,fp) == 0) return NULL;
3702 v = enc[0]|(enc[1]<<8);
3703 val = (int16_t)v;
3704 } else if (enctype == REDIS_RDB_ENC_INT32) {
3705 uint32_t v;
3706 if (fread(enc,4,1,fp) == 0) return NULL;
3707 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3708 val = (int32_t)v;
3709 } else {
3710 val = 0; /* anti-warning */
3711 redisPanic("Unknown RDB integer encoding type");
3712 }
3713 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3714 }
3715
3716 static robj *rdbLoadLzfStringObject(FILE*fp) {
3717 unsigned int len, clen;
3718 unsigned char *c = NULL;
3719 sds val = NULL;
3720
3721 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3722 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3723 if ((c = zmalloc(clen)) == NULL) goto err;
3724 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3725 if (fread(c,clen,1,fp) == 0) goto err;
3726 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3727 zfree(c);
3728 return createObject(REDIS_STRING,val);
3729 err:
3730 zfree(c);
3731 sdsfree(val);
3732 return NULL;
3733 }
3734
3735 static robj *rdbLoadStringObject(FILE*fp) {
3736 int isencoded;
3737 uint32_t len;
3738 sds val;
3739
3740 len = rdbLoadLen(fp,&isencoded);
3741 if (isencoded) {
3742 switch(len) {
3743 case REDIS_RDB_ENC_INT8:
3744 case REDIS_RDB_ENC_INT16:
3745 case REDIS_RDB_ENC_INT32:
3746 return rdbLoadIntegerObject(fp,len);
3747 case REDIS_RDB_ENC_LZF:
3748 return rdbLoadLzfStringObject(fp);
3749 default:
3750 redisPanic("Unknown RDB encoding type");
3751 }
3752 }
3753
3754 if (len == REDIS_RDB_LENERR) return NULL;
3755 val = sdsnewlen(NULL,len);
3756 if (len && fread(val,len,1,fp) == 0) {
3757 sdsfree(val);
3758 return NULL;
3759 }
3760 return createObject(REDIS_STRING,val);
3761 }
3762
3763 /* For information about double serialization check rdbSaveDoubleValue() */
3764 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3765 char buf[128];
3766 unsigned char len;
3767
3768 if (fread(&len,1,1,fp) == 0) return -1;
3769 switch(len) {
3770 case 255: *val = R_NegInf; return 0;
3771 case 254: *val = R_PosInf; return 0;
3772 case 253: *val = R_Nan; return 0;
3773 default:
3774 if (fread(buf,len,1,fp) == 0) return -1;
3775 buf[len] = '\0';
3776 sscanf(buf, "%lg", val);
3777 return 0;
3778 }
3779 }
3780
3781 /* Load a Redis object of the specified type from the specified file.
3782 * On success a newly allocated object is returned, otherwise NULL. */
3783 static robj *rdbLoadObject(int type, FILE *fp) {
3784 robj *o;
3785
3786 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3787 if (type == REDIS_STRING) {
3788 /* Read string value */
3789 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3790 o = tryObjectEncoding(o);
3791 } else if (type == REDIS_LIST || type == REDIS_SET) {
3792 /* Read list/set value */
3793 uint32_t listlen;
3794
3795 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3796 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3797 /* It's faster to expand the dict to the right size asap in order
3798 * to avoid rehashing */
3799 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3800 dictExpand(o->ptr,listlen);
3801 /* Load every single element of the list/set */
3802 while(listlen--) {
3803 robj *ele;
3804
3805 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3806 ele = tryObjectEncoding(ele);
3807 if (type == REDIS_LIST) {
3808 listAddNodeTail((list*)o->ptr,ele);
3809 } else {
3810 dictAdd((dict*)o->ptr,ele,NULL);
3811 }
3812 }
3813 } else if (type == REDIS_ZSET) {
3814 /* Read list/set value */
3815 size_t zsetlen;
3816 zset *zs;
3817
3818 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3819 o = createZsetObject();
3820 zs = o->ptr;
3821 /* Load every single element of the list/set */
3822 while(zsetlen--) {
3823 robj *ele;
3824 double *score = zmalloc(sizeof(double));
3825
3826 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3827 ele = tryObjectEncoding(ele);
3828 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3829 dictAdd(zs->dict,ele,score);
3830 zslInsert(zs->zsl,*score,ele);
3831 incrRefCount(ele); /* added to skiplist */
3832 }
3833 } else if (type == REDIS_HASH) {
3834 size_t hashlen;
3835
3836 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3837 o = createHashObject();
3838 /* Too many entries? Use an hash table. */
3839 if (hashlen > server.hash_max_zipmap_entries)
3840 convertToRealHash(o);
3841 /* Load every key/value, then set it into the zipmap or hash
3842 * table, as needed. */
3843 while(hashlen--) {
3844 robj *key, *val;
3845
3846 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3847 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3848 /* If we are using a zipmap and there are too big values
3849 * the object is converted to real hash table encoding. */
3850 if (o->encoding != REDIS_ENCODING_HT &&
3851 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3852 sdslen(val->ptr) > server.hash_max_zipmap_value))
3853 {
3854 convertToRealHash(o);
3855 }
3856
3857 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3858 unsigned char *zm = o->ptr;
3859
3860 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3861 val->ptr,sdslen(val->ptr),NULL);
3862 o->ptr = zm;
3863 decrRefCount(key);
3864 decrRefCount(val);
3865 } else {
3866 key = tryObjectEncoding(key);
3867 val = tryObjectEncoding(val);
3868 dictAdd((dict*)o->ptr,key,val);
3869 }
3870 }
3871 } else {
3872 redisPanic("Unknown object type");
3873 }
3874 return o;
3875 }
3876
3877 static int rdbLoad(char *filename) {
3878 FILE *fp;
3879 robj *keyobj = NULL;
3880 uint32_t dbid;
3881 int type, retval, rdbver;
3882 dict *d = server.db[0].dict;
3883 redisDb *db = server.db+0;
3884 char buf[1024];
3885 time_t expiretime = -1, now = time(NULL);
3886 long long loadedkeys = 0;
3887
3888 fp = fopen(filename,"r");
3889 if (!fp) return REDIS_ERR;
3890 if (fread(buf,9,1,fp) == 0) goto eoferr;
3891 buf[9] = '\0';
3892 if (memcmp(buf,"REDIS",5) != 0) {
3893 fclose(fp);
3894 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3895 return REDIS_ERR;
3896 }
3897 rdbver = atoi(buf+5);
3898 if (rdbver != 1) {
3899 fclose(fp);
3900 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3901 return REDIS_ERR;
3902 }
3903 while(1) {
3904 robj *o;
3905
3906 /* Read type. */
3907 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3908 if (type == REDIS_EXPIRETIME) {
3909 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3910 /* We read the time so we need to read the object type again */
3911 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3912 }
3913 if (type == REDIS_EOF) break;
3914 /* Handle SELECT DB opcode as a special case */
3915 if (type == REDIS_SELECTDB) {
3916 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3917 goto eoferr;
3918 if (dbid >= (unsigned)server.dbnum) {
3919 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3920 exit(1);
3921 }
3922 db = server.db+dbid;
3923 d = db->dict;
3924 continue;
3925 }
3926 /* Read key */
3927 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3928 /* Read value */
3929 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3930 /* Add the new object in the hash table */
3931 retval = dictAdd(d,keyobj,o);
3932 if (retval == DICT_ERR) {
3933 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3934 exit(1);
3935 }
3936 /* Set the expire time if needed */
3937 if (expiretime != -1) {
3938 setExpire(db,keyobj,expiretime);
3939 /* Delete this key if already expired */
3940 if (expiretime < now) deleteKey(db,keyobj);
3941 expiretime = -1;
3942 }
3943 keyobj = o = NULL;
3944 /* Handle swapping while loading big datasets when VM is on */
3945 loadedkeys++;
3946 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3947 while (zmalloc_used_memory() > server.vm_max_memory) {
3948 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3949 }
3950 }
3951 }
3952 fclose(fp);
3953 return REDIS_OK;
3954
3955 eoferr: /* unexpected end of file is handled here with a fatal exit */
3956 if (keyobj) decrRefCount(keyobj);
3957 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3958 exit(1);
3959 return REDIS_ERR; /* Just to avoid warning */
3960 }
3961
3962 /*================================== Commands =============================== */
3963
3964 static void authCommand(redisClient *c) {
3965 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3966 c->authenticated = 1;
3967 addReply(c,shared.ok);
3968 } else {
3969 c->authenticated = 0;
3970 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3971 }
3972 }
3973
3974 static void pingCommand(redisClient *c) {
3975 addReply(c,shared.pong);
3976 }
3977
3978 static void echoCommand(redisClient *c) {
3979 addReplyBulk(c,c->argv[1]);
3980 }
3981
3982 /*=================================== Strings =============================== */
3983
3984 static void setGenericCommand(redisClient *c, int nx) {
3985 int retval;
3986
3987 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3988 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3989 if (retval == DICT_ERR) {
3990 if (!nx) {
3991 /* If the key is about a swapped value, we want a new key object
3992 * to overwrite the old. So we delete the old key in the database.
3993 * This will also make sure that swap pages about the old object
3994 * will be marked as free. */
3995 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3996 incrRefCount(c->argv[1]);
3997 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3998 incrRefCount(c->argv[2]);
3999 } else {
4000 addReply(c,shared.czero);
4001 return;
4002 }
4003 } else {
4004 incrRefCount(c->argv[1]);
4005 incrRefCount(c->argv[2]);
4006 }
4007 server.dirty++;
4008 removeExpire(c->db,c->argv[1]);
4009 addReply(c, nx ? shared.cone : shared.ok);
4010 }
4011
4012 static void setCommand(redisClient *c) {
4013 setGenericCommand(c,0);
4014 }
4015
4016 static void setnxCommand(redisClient *c) {
4017 setGenericCommand(c,1);
4018 }
4019
4020 static int getGenericCommand(redisClient *c) {
4021 robj *o;
4022
4023 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
4024 return REDIS_OK;
4025
4026 if (o->type != REDIS_STRING) {
4027 addReply(c,shared.wrongtypeerr);
4028 return REDIS_ERR;
4029 } else {
4030 addReplyBulk(c,o);
4031 return REDIS_OK;
4032 }
4033 }
4034
4035 static void getCommand(redisClient *c) {
4036 getGenericCommand(c);
4037 }
4038
4039 static void getsetCommand(redisClient *c) {
4040 if (getGenericCommand(c) == REDIS_ERR) return;
4041 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4042 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4043 } else {
4044 incrRefCount(c->argv[1]);
4045 }
4046 incrRefCount(c->argv[2]);
4047 server.dirty++;
4048 removeExpire(c->db,c->argv[1]);
4049 }
4050
4051 static void mgetCommand(redisClient *c) {
4052 int j;
4053
4054 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4055 for (j = 1; j < c->argc; j++) {
4056 robj *o = lookupKeyRead(c->db,c->argv[j]);
4057 if (o == NULL) {
4058 addReply(c,shared.nullbulk);
4059 } else {
4060 if (o->type != REDIS_STRING) {
4061 addReply(c,shared.nullbulk);
4062 } else {
4063 addReplyBulk(c,o);
4064 }
4065 }
4066 }
4067 }
4068
4069 static void msetGenericCommand(redisClient *c, int nx) {
4070 int j, busykeys = 0;
4071
4072 if ((c->argc % 2) == 0) {
4073 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4074 return;
4075 }
4076 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4077 * set nothing at all if at least one already key exists. */
4078 if (nx) {
4079 for (j = 1; j < c->argc; j += 2) {
4080 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4081 busykeys++;
4082 }
4083 }
4084 }
4085 if (busykeys) {
4086 addReply(c, shared.czero);
4087 return;
4088 }
4089
4090 for (j = 1; j < c->argc; j += 2) {
4091 int retval;
4092
4093 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4094 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4095 if (retval == DICT_ERR) {
4096 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4097 incrRefCount(c->argv[j+1]);
4098 } else {
4099 incrRefCount(c->argv[j]);
4100 incrRefCount(c->argv[j+1]);
4101 }
4102 removeExpire(c->db,c->argv[j]);
4103 }
4104 server.dirty += (c->argc-1)/2;
4105 addReply(c, nx ? shared.cone : shared.ok);
4106 }
4107
4108 static void msetCommand(redisClient *c) {
4109 msetGenericCommand(c,0);
4110 }
4111
4112 static void msetnxCommand(redisClient *c) {
4113 msetGenericCommand(c,1);
4114 }
4115
4116 static void incrDecrCommand(redisClient *c, long long incr) {
4117 long long value;
4118 int retval;
4119 robj *o;
4120
4121 o = lookupKeyWrite(c->db,c->argv[1]);
4122
4123 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4124
4125 value += incr;
4126 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4127 o = tryObjectEncoding(o);
4128 retval = dictAdd(c->db->dict,c->argv[1],o);
4129 if (retval == DICT_ERR) {
4130 dictReplace(c->db->dict,c->argv[1],o);
4131 removeExpire(c->db,c->argv[1]);
4132 } else {
4133 incrRefCount(c->argv[1]);
4134 }
4135 server.dirty++;
4136 addReply(c,shared.colon);
4137 addReply(c,o);
4138 addReply(c,shared.crlf);
4139 }
4140
4141 static void incrCommand(redisClient *c) {
4142 incrDecrCommand(c,1);
4143 }
4144
4145 static void decrCommand(redisClient *c) {
4146 incrDecrCommand(c,-1);
4147 }
4148
4149 static void incrbyCommand(redisClient *c) {
4150 long long incr;
4151
4152 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4153 incrDecrCommand(c,incr);
4154 }
4155
4156 static void decrbyCommand(redisClient *c) {
4157 long long incr;
4158
4159 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4160 incrDecrCommand(c,-incr);
4161 }
4162
4163 static void appendCommand(redisClient *c) {
4164 int retval;
4165 size_t totlen;
4166 robj *o;
4167
4168 o = lookupKeyWrite(c->db,c->argv[1]);
4169 if (o == NULL) {
4170 /* Create the key */
4171 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4172 incrRefCount(c->argv[1]);
4173 incrRefCount(c->argv[2]);
4174 totlen = stringObjectLen(c->argv[2]);
4175 } else {
4176 dictEntry *de;
4177
4178 de = dictFind(c->db->dict,c->argv[1]);
4179 assert(de != NULL);
4180
4181 o = dictGetEntryVal(de);
4182 if (o->type != REDIS_STRING) {
4183 addReply(c,shared.wrongtypeerr);
4184 return;
4185 }
4186 /* If the object is specially encoded or shared we have to make
4187 * a copy */
4188 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4189 robj *decoded = getDecodedObject(o);
4190
4191 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4192 decrRefCount(decoded);
4193 dictReplace(c->db->dict,c->argv[1],o);
4194 }
4195 /* APPEND! */
4196 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4197 o->ptr = sdscatlen(o->ptr,
4198 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4199 } else {
4200 o->ptr = sdscatprintf(o->ptr, "%ld",
4201 (unsigned long) c->argv[2]->ptr);
4202 }
4203 totlen = sdslen(o->ptr);
4204 }
4205 server.dirty++;
4206 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4207 }
4208
4209 static void substrCommand(redisClient *c) {
4210 robj *o;
4211 long start = atoi(c->argv[2]->ptr);
4212 long end = atoi(c->argv[3]->ptr);
4213 size_t rangelen, strlen;
4214 sds range;
4215
4216 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4217 checkType(c,o,REDIS_STRING)) return;
4218
4219 o = getDecodedObject(o);
4220 strlen = sdslen(o->ptr);
4221
4222 /* convert negative indexes */
4223 if (start < 0) start = strlen+start;
4224 if (end < 0) end = strlen+end;
4225 if (start < 0) start = 0;
4226 if (end < 0) end = 0;
4227
4228 /* indexes sanity checks */
4229 if (start > end || (size_t)start >= strlen) {
4230 /* Out of range start or start > end result in null reply */
4231 addReply(c,shared.nullbulk);
4232 decrRefCount(o);
4233 return;
4234 }
4235 if ((size_t)end >= strlen) end = strlen-1;
4236 rangelen = (end-start)+1;
4237
4238 /* Return the result */
4239 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4240 range = sdsnewlen((char*)o->ptr+start,rangelen);
4241 addReplySds(c,range);
4242 addReply(c,shared.crlf);
4243 decrRefCount(o);
4244 }
4245
4246 /* ========================= Type agnostic commands ========================= */
4247
4248 static void delCommand(redisClient *c) {
4249 int deleted = 0, j;
4250
4251 for (j = 1; j < c->argc; j++) {
4252 if (deleteKey(c->db,c->argv[j])) {
4253 server.dirty++;
4254 deleted++;
4255 }
4256 }
4257 addReplyLong(c,deleted);
4258 }
4259
4260 static void existsCommand(redisClient *c) {
4261 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4262 }
4263
4264 static void selectCommand(redisClient *c) {
4265 int id = atoi(c->argv[1]->ptr);
4266
4267 if (selectDb(c,id) == REDIS_ERR) {
4268 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4269 } else {
4270 addReply(c,shared.ok);
4271 }
4272 }
4273
4274 static void randomkeyCommand(redisClient *c) {
4275 dictEntry *de;
4276 robj *key;
4277
4278 while(1) {
4279 de = dictGetRandomKey(c->db->dict);
4280 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4281 }
4282
4283 if (de == NULL) {
4284 addReply(c,shared.nullbulk);
4285 return;
4286 }
4287
4288 key = dictGetEntryKey(de);
4289 if (server.vm_enabled) {
4290 key = dupStringObject(key);
4291 addReplyBulk(c,key);
4292 decrRefCount(key);
4293 } else {
4294 addReplyBulk(c,key);
4295 }
4296 }
4297
4298 static void keysCommand(redisClient *c) {
4299 dictIterator *di;
4300 dictEntry *de;
4301 sds pattern = c->argv[1]->ptr;
4302 int plen = sdslen(pattern);
4303 unsigned long numkeys = 0;
4304 robj *lenobj = createObject(REDIS_STRING,NULL);
4305
4306 di = dictGetIterator(c->db->dict);
4307 addReply(c,lenobj);
4308 decrRefCount(lenobj);
4309 while((de = dictNext(di)) != NULL) {
4310 robj *keyobj = dictGetEntryKey(de);
4311
4312 sds key = keyobj->ptr;
4313 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4314 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4315 if (expireIfNeeded(c->db,keyobj) == 0) {
4316 addReplyBulk(c,keyobj);
4317 numkeys++;
4318 }
4319 }
4320 }
4321 dictReleaseIterator(di);
4322 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4323 }
4324
4325 static void dbsizeCommand(redisClient *c) {
4326 addReplySds(c,
4327 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4328 }
4329
4330 static void lastsaveCommand(redisClient *c) {
4331 addReplySds(c,
4332 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4333 }
4334
4335 static void typeCommand(redisClient *c) {
4336 robj *o;
4337 char *type;
4338
4339 o = lookupKeyRead(c->db,c->argv[1]);
4340 if (o == NULL) {
4341 type = "+none";
4342 } else {
4343 switch(o->type) {
4344 case REDIS_STRING: type = "+string"; break;
4345 case REDIS_LIST: type = "+list"; break;
4346 case REDIS_SET: type = "+set"; break;
4347 case REDIS_ZSET: type = "+zset"; break;
4348 case REDIS_HASH: type = "+hash"; break;
4349 default: type = "+unknown"; break;
4350 }
4351 }
4352 addReplySds(c,sdsnew(type));
4353 addReply(c,shared.crlf);
4354 }
4355
4356 static void saveCommand(redisClient *c) {
4357 if (server.bgsavechildpid != -1) {
4358 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4359 return;
4360 }
4361 if (rdbSave(server.dbfilename) == REDIS_OK) {
4362 addReply(c,shared.ok);
4363 } else {
4364 addReply(c,shared.err);
4365 }
4366 }
4367
4368 static void bgsaveCommand(redisClient *c) {
4369 if (server.bgsavechildpid != -1) {
4370 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4371 return;
4372 }
4373 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4374 char *status = "+Background saving started\r\n";
4375 addReplySds(c,sdsnew(status));
4376 } else {
4377 addReply(c,shared.err);
4378 }
4379 }
4380
4381 static void shutdownCommand(redisClient *c) {
4382 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4383 /* Kill the saving child if there is a background saving in progress.
4384 We want to avoid race conditions, for instance our saving child may
4385 overwrite the synchronous saving did by SHUTDOWN. */
4386 if (server.bgsavechildpid != -1) {
4387 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4388 kill(server.bgsavechildpid,SIGKILL);
4389 rdbRemoveTempFile(server.bgsavechildpid);
4390 }
4391 if (server.appendonly) {
4392 /* Append only file: fsync() the AOF and exit */
4393 fsync(server.appendfd);
4394 if (server.vm_enabled) unlink(server.vm_swap_file);
4395 exit(0);
4396 } else {
4397 /* Snapshotting. Perform a SYNC SAVE and exit */
4398 if (rdbSave(server.dbfilename) == REDIS_OK) {
4399 if (server.daemonize)
4400 unlink(server.pidfile);
4401 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4402 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4403 if (server.vm_enabled) unlink(server.vm_swap_file);
4404 exit(0);
4405 } else {
4406 /* Ooops.. error saving! The best we can do is to continue
4407 * operating. Note that if there was a background saving process,
4408 * in the next cron() Redis will be notified that the background
4409 * saving aborted, handling special stuff like slaves pending for
4410 * synchronization... */
4411 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4412 addReplySds(c,
4413 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4414 }
4415 }
4416 }
4417
4418 static void renameGenericCommand(redisClient *c, int nx) {
4419 robj *o;
4420
4421 /* To use the same key as src and dst is probably an error */
4422 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4423 addReply(c,shared.sameobjecterr);
4424 return;
4425 }
4426
4427 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4428 return;
4429
4430 incrRefCount(o);
4431 deleteIfVolatile(c->db,c->argv[2]);
4432 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4433 if (nx) {
4434 decrRefCount(o);
4435 addReply(c,shared.czero);
4436 return;
4437 }
4438 dictReplace(c->db->dict,c->argv[2],o);
4439 } else {
4440 incrRefCount(c->argv[2]);
4441 }
4442 deleteKey(c->db,c->argv[1]);
4443 server.dirty++;
4444 addReply(c,nx ? shared.cone : shared.ok);
4445 }
4446
4447 static void renameCommand(redisClient *c) {
4448 renameGenericCommand(c,0);
4449 }
4450
4451 static void renamenxCommand(redisClient *c) {
4452 renameGenericCommand(c,1);
4453 }
4454
4455 static void moveCommand(redisClient *c) {
4456 robj *o;
4457 redisDb *src, *dst;
4458 int srcid;
4459
4460 /* Obtain source and target DB pointers */
4461 src = c->db;
4462 srcid = c->db->id;
4463 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4464 addReply(c,shared.outofrangeerr);
4465 return;
4466 }
4467 dst = c->db;
4468 selectDb(c,srcid); /* Back to the source DB */
4469
4470 /* If the user is moving using as target the same
4471 * DB as the source DB it is probably an error. */
4472 if (src == dst) {
4473 addReply(c,shared.sameobjecterr);
4474 return;
4475 }
4476
4477 /* Check if the element exists and get a reference */
4478 o = lookupKeyWrite(c->db,c->argv[1]);
4479 if (!o) {
4480 addReply(c,shared.czero);
4481 return;
4482 }
4483
4484 /* Try to add the element to the target DB */
4485 deleteIfVolatile(dst,c->argv[1]);
4486 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4487 addReply(c,shared.czero);
4488 return;
4489 }
4490 incrRefCount(c->argv[1]);
4491 incrRefCount(o);
4492
4493 /* OK! key moved, free the entry in the source DB */
4494 deleteKey(src,c->argv[1]);
4495 server.dirty++;
4496 addReply(c,shared.cone);
4497 }
4498
4499 /* =================================== Lists ================================ */
4500 static void pushGenericCommand(redisClient *c, int where) {
4501 robj *lobj;
4502 list *list;
4503
4504 lobj = lookupKeyWrite(c->db,c->argv[1]);
4505 if (lobj == NULL) {
4506 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4507 addReply(c,shared.cone);
4508 return;
4509 }
4510 lobj = createListObject();
4511 list = lobj->ptr;
4512 if (where == REDIS_HEAD) {
4513 listAddNodeHead(list,c->argv[2]);
4514 } else {
4515 listAddNodeTail(list,c->argv[2]);
4516 }
4517 dictAdd(c->db->dict,c->argv[1],lobj);
4518 incrRefCount(c->argv[1]);
4519 incrRefCount(c->argv[2]);
4520 } else {
4521 if (lobj->type != REDIS_LIST) {
4522 addReply(c,shared.wrongtypeerr);
4523 return;
4524 }
4525 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4526 addReply(c,shared.cone);
4527 return;
4528 }
4529 list = lobj->ptr;
4530 if (where == REDIS_HEAD) {
4531 listAddNodeHead(list,c->argv[2]);
4532 } else {
4533 listAddNodeTail(list,c->argv[2]);
4534 }
4535 incrRefCount(c->argv[2]);
4536 }
4537 server.dirty++;
4538 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4539 }
4540
4541 static void lpushCommand(redisClient *c) {
4542 pushGenericCommand(c,REDIS_HEAD);
4543 }
4544
4545 static void rpushCommand(redisClient *c) {
4546 pushGenericCommand(c,REDIS_TAIL);
4547 }
4548
4549 static void llenCommand(redisClient *c) {
4550 robj *o;
4551 list *l;
4552
4553 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4554 checkType(c,o,REDIS_LIST)) return;
4555
4556 l = o->ptr;
4557 addReplyUlong(c,listLength(l));
4558 }
4559
4560 static void lindexCommand(redisClient *c) {
4561 robj *o;
4562 int index = atoi(c->argv[2]->ptr);
4563 list *list;
4564 listNode *ln;
4565
4566 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4567 checkType(c,o,REDIS_LIST)) return;
4568 list = o->ptr;
4569
4570 ln = listIndex(list, index);
4571 if (ln == NULL) {
4572 addReply(c,shared.nullbulk);
4573 } else {
4574 robj *ele = listNodeValue(ln);
4575 addReplyBulk(c,ele);
4576 }
4577 }
4578
4579 static void lsetCommand(redisClient *c) {
4580 robj *o;
4581 int index = atoi(c->argv[2]->ptr);
4582 list *list;
4583 listNode *ln;
4584
4585 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4586 checkType(c,o,REDIS_LIST)) return;
4587 list = o->ptr;
4588
4589 ln = listIndex(list, index);
4590 if (ln == NULL) {
4591 addReply(c,shared.outofrangeerr);
4592 } else {
4593 robj *ele = listNodeValue(ln);
4594
4595 decrRefCount(ele);
4596 listNodeValue(ln) = c->argv[3];
4597 incrRefCount(c->argv[3]);
4598 addReply(c,shared.ok);
4599 server.dirty++;
4600 }
4601 }
4602
4603 static void popGenericCommand(redisClient *c, int where) {
4604 robj *o;
4605 list *list;
4606 listNode *ln;
4607
4608 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4609 checkType(c,o,REDIS_LIST)) return;
4610 list = o->ptr;
4611
4612 if (where == REDIS_HEAD)
4613 ln = listFirst(list);
4614 else
4615 ln = listLast(list);
4616
4617 if (ln == NULL) {
4618 addReply(c,shared.nullbulk);
4619 } else {
4620 robj *ele = listNodeValue(ln);
4621 addReplyBulk(c,ele);
4622 listDelNode(list,ln);
4623 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4624 server.dirty++;
4625 }
4626 }
4627
4628 static void lpopCommand(redisClient *c) {
4629 popGenericCommand(c,REDIS_HEAD);
4630 }
4631
4632 static void rpopCommand(redisClient *c) {
4633 popGenericCommand(c,REDIS_TAIL);
4634 }
4635
4636 static void lrangeCommand(redisClient *c) {
4637 robj *o;
4638 int start = atoi(c->argv[2]->ptr);
4639 int end = atoi(c->argv[3]->ptr);
4640 int llen;
4641 int rangelen, j;
4642 list *list;
4643 listNode *ln;
4644 robj *ele;
4645
4646 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4647 || checkType(c,o,REDIS_LIST)) return;
4648 list = o->ptr;
4649 llen = listLength(list);
4650
4651 /* convert negative indexes */
4652 if (start < 0) start = llen+start;
4653 if (end < 0) end = llen+end;
4654 if (start < 0) start = 0;
4655 if (end < 0) end = 0;
4656
4657 /* indexes sanity checks */
4658 if (start > end || start >= llen) {
4659 /* Out of range start or start > end result in empty list */
4660 addReply(c,shared.emptymultibulk);
4661 return;
4662 }
4663 if (end >= llen) end = llen-1;
4664 rangelen = (end-start)+1;
4665
4666 /* Return the result in form of a multi-bulk reply */
4667 ln = listIndex(list, start);
4668 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4669 for (j = 0; j < rangelen; j++) {
4670 ele = listNodeValue(ln);
4671 addReplyBulk(c,ele);
4672 ln = ln->next;
4673 }
4674 }
4675
4676 static void ltrimCommand(redisClient *c) {
4677 robj *o;
4678 int start = atoi(c->argv[2]->ptr);
4679 int end = atoi(c->argv[3]->ptr);
4680 int llen;
4681 int j, ltrim, rtrim;
4682 list *list;
4683 listNode *ln;
4684
4685 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4686 checkType(c,o,REDIS_LIST)) return;
4687 list = o->ptr;
4688 llen = listLength(list);
4689
4690 /* convert negative indexes */
4691 if (start < 0) start = llen+start;
4692 if (end < 0) end = llen+end;
4693 if (start < 0) start = 0;
4694 if (end < 0) end = 0;
4695
4696 /* indexes sanity checks */
4697 if (start > end || start >= llen) {
4698 /* Out of range start or start > end result in empty list */
4699 ltrim = llen;
4700 rtrim = 0;
4701 } else {
4702 if (end >= llen) end = llen-1;
4703 ltrim = start;
4704 rtrim = llen-end-1;
4705 }
4706
4707 /* Remove list elements to perform the trim */
4708 for (j = 0; j < ltrim; j++) {
4709 ln = listFirst(list);
4710 listDelNode(list,ln);
4711 }
4712 for (j = 0; j < rtrim; j++) {
4713 ln = listLast(list);
4714 listDelNode(list,ln);
4715 }
4716 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4717 server.dirty++;
4718 addReply(c,shared.ok);
4719 }
4720
4721 static void lremCommand(redisClient *c) {
4722 robj *o;
4723 list *list;
4724 listNode *ln, *next;
4725 int toremove = atoi(c->argv[2]->ptr);
4726 int removed = 0;
4727 int fromtail = 0;
4728
4729 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4730 checkType(c,o,REDIS_LIST)) return;
4731 list = o->ptr;
4732
4733 if (toremove < 0) {
4734 toremove = -toremove;
4735 fromtail = 1;
4736 }
4737 ln = fromtail ? list->tail : list->head;
4738 while (ln) {
4739 robj *ele = listNodeValue(ln);
4740
4741 next = fromtail ? ln->prev : ln->next;
4742 if (compareStringObjects(ele,c->argv[3]) == 0) {
4743 listDelNode(list,ln);
4744 server.dirty++;
4745 removed++;
4746 if (toremove && removed == toremove) break;
4747 }
4748 ln = next;
4749 }
4750 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4751 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4752 }
4753
4754 /* This is the semantic of this command:
4755 * RPOPLPUSH srclist dstlist:
4756 * IF LLEN(srclist) > 0
4757 * element = RPOP srclist
4758 * LPUSH dstlist element
4759 * RETURN element
4760 * ELSE
4761 * RETURN nil
4762 * END
4763 * END
4764 *
4765 * The idea is to be able to get an element from a list in a reliable way
4766 * since the element is not just returned but pushed against another list
4767 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4768 */
4769 static void rpoplpushcommand(redisClient *c) {
4770 robj *sobj;
4771 list *srclist;
4772 listNode *ln;
4773
4774 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4775 checkType(c,sobj,REDIS_LIST)) return;
4776 srclist = sobj->ptr;
4777 ln = listLast(srclist);
4778
4779 if (ln == NULL) {
4780 addReply(c,shared.nullbulk);
4781 } else {
4782 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4783 robj *ele = listNodeValue(ln);
4784 list *dstlist;
4785
4786 if (dobj && dobj->type != REDIS_LIST) {
4787 addReply(c,shared.wrongtypeerr);
4788 return;
4789 }
4790
4791 /* Add the element to the target list (unless it's directly
4792 * passed to some BLPOP-ing client */
4793 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4794 if (dobj == NULL) {
4795 /* Create the list if the key does not exist */
4796 dobj = createListObject();
4797 dictAdd(c->db->dict,c->argv[2],dobj);
4798 incrRefCount(c->argv[2]);
4799 }
4800 dstlist = dobj->ptr;
4801 listAddNodeHead(dstlist,ele);
4802 incrRefCount(ele);
4803 }
4804
4805 /* Send the element to the client as reply as well */
4806 addReplyBulk(c,ele);
4807
4808 /* Finally remove the element from the source list */
4809 listDelNode(srclist,ln);
4810 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4811 server.dirty++;
4812 }
4813 }
4814
4815 /* ==================================== Sets ================================ */
4816
4817 static void saddCommand(redisClient *c) {
4818 robj *set;
4819
4820 set = lookupKeyWrite(c->db,c->argv[1]);
4821 if (set == NULL) {
4822 set = createSetObject();
4823 dictAdd(c->db->dict,c->argv[1],set);
4824 incrRefCount(c->argv[1]);
4825 } else {
4826 if (set->type != REDIS_SET) {
4827 addReply(c,shared.wrongtypeerr);
4828 return;
4829 }
4830 }
4831 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4832 incrRefCount(c->argv[2]);
4833 server.dirty++;
4834 addReply(c,shared.cone);
4835 } else {
4836 addReply(c,shared.czero);
4837 }
4838 }
4839
4840 static void sremCommand(redisClient *c) {
4841 robj *set;
4842
4843 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4844 checkType(c,set,REDIS_SET)) return;
4845
4846 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4847 server.dirty++;
4848 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4849 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4850 addReply(c,shared.cone);
4851 } else {
4852 addReply(c,shared.czero);
4853 }
4854 }
4855
4856 static void smoveCommand(redisClient *c) {
4857 robj *srcset, *dstset;
4858
4859 srcset = lookupKeyWrite(c->db,c->argv[1]);
4860 dstset = lookupKeyWrite(c->db,c->argv[2]);
4861
4862 /* If the source key does not exist return 0, if it's of the wrong type
4863 * raise an error */
4864 if (srcset == NULL || srcset->type != REDIS_SET) {
4865 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4866 return;
4867 }
4868 /* Error if the destination key is not a set as well */
4869 if (dstset && dstset->type != REDIS_SET) {
4870 addReply(c,shared.wrongtypeerr);
4871 return;
4872 }
4873 /* Remove the element from the source set */
4874 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4875 /* Key not found in the src set! return zero */
4876 addReply(c,shared.czero);
4877 return;
4878 }
4879 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4880 deleteKey(c->db,c->argv[1]);
4881 server.dirty++;
4882 /* Add the element to the destination set */
4883 if (!dstset) {
4884 dstset = createSetObject();
4885 dictAdd(c->db->dict,c->argv[2],dstset);
4886 incrRefCount(c->argv[2]);
4887 }
4888 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4889 incrRefCount(c->argv[3]);
4890 addReply(c,shared.cone);
4891 }
4892
4893 static void sismemberCommand(redisClient *c) {
4894 robj *set;
4895
4896 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4897 checkType(c,set,REDIS_SET)) return;
4898
4899 if (dictFind(set->ptr,c->argv[2]))
4900 addReply(c,shared.cone);
4901 else
4902 addReply(c,shared.czero);
4903 }
4904
4905 static void scardCommand(redisClient *c) {
4906 robj *o;
4907 dict *s;
4908
4909 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4910 checkType(c,o,REDIS_SET)) return;
4911
4912 s = o->ptr;
4913 addReplyUlong(c,dictSize(s));
4914 }
4915
4916 static void spopCommand(redisClient *c) {
4917 robj *set;
4918 dictEntry *de;
4919
4920 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4921 checkType(c,set,REDIS_SET)) return;
4922
4923 de = dictGetRandomKey(set->ptr);
4924 if (de == NULL) {
4925 addReply(c,shared.nullbulk);
4926 } else {
4927 robj *ele = dictGetEntryKey(de);
4928
4929 addReplyBulk(c,ele);
4930 dictDelete(set->ptr,ele);
4931 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4932 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4933 server.dirty++;
4934 }
4935 }
4936
4937 static void srandmemberCommand(redisClient *c) {
4938 robj *set;
4939 dictEntry *de;
4940
4941 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4942 checkType(c,set,REDIS_SET)) return;
4943
4944 de = dictGetRandomKey(set->ptr);
4945 if (de == NULL) {
4946 addReply(c,shared.nullbulk);
4947 } else {
4948 robj *ele = dictGetEntryKey(de);
4949
4950 addReplyBulk(c,ele);
4951 }
4952 }
4953
4954 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4955 dict **d1 = (void*) s1, **d2 = (void*) s2;
4956
4957 return dictSize(*d1)-dictSize(*d2);
4958 }
4959
4960 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4961 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4962 dictIterator *di;
4963 dictEntry *de;
4964 robj *lenobj = NULL, *dstset = NULL;
4965 unsigned long j, cardinality = 0;
4966
4967 for (j = 0; j < setsnum; j++) {
4968 robj *setobj;
4969
4970 setobj = dstkey ?
4971 lookupKeyWrite(c->db,setskeys[j]) :
4972 lookupKeyRead(c->db,setskeys[j]);
4973 if (!setobj) {
4974 zfree(dv);
4975 if (dstkey) {
4976 if (deleteKey(c->db,dstkey))
4977 server.dirty++;
4978 addReply(c,shared.czero);
4979 } else {
4980 addReply(c,shared.emptymultibulk);
4981 }
4982 return;
4983 }
4984 if (setobj->type != REDIS_SET) {
4985 zfree(dv);
4986 addReply(c,shared.wrongtypeerr);
4987 return;
4988 }
4989 dv[j] = setobj->ptr;
4990 }
4991 /* Sort sets from the smallest to largest, this will improve our
4992 * algorithm's performace */
4993 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4994
4995 /* The first thing we should output is the total number of elements...
4996 * since this is a multi-bulk write, but at this stage we don't know
4997 * the intersection set size, so we use a trick, append an empty object
4998 * to the output list and save the pointer to later modify it with the
4999 * right length */
5000 if (!dstkey) {
5001 lenobj = createObject(REDIS_STRING,NULL);
5002 addReply(c,lenobj);
5003 decrRefCount(lenobj);
5004 } else {
5005 /* If we have a target key where to store the resulting set
5006 * create this key with an empty set inside */
5007 dstset = createSetObject();
5008 }
5009
5010 /* Iterate all the elements of the first (smallest) set, and test
5011 * the element against all the other sets, if at least one set does
5012 * not include the element it is discarded */
5013 di = dictGetIterator(dv[0]);
5014
5015 while((de = dictNext(di)) != NULL) {
5016 robj *ele;
5017
5018 for (j = 1; j < setsnum; j++)
5019 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5020 if (j != setsnum)
5021 continue; /* at least one set does not contain the member */
5022 ele = dictGetEntryKey(de);
5023 if (!dstkey) {
5024 addReplyBulk(c,ele);
5025 cardinality++;
5026 } else {
5027 dictAdd(dstset->ptr,ele,NULL);
5028 incrRefCount(ele);
5029 }
5030 }
5031 dictReleaseIterator(di);
5032
5033 if (dstkey) {
5034 /* Store the resulting set into the target, if the intersection
5035 * is not an empty set. */
5036 deleteKey(c->db,dstkey);
5037 if (dictSize((dict*)dstset->ptr) > 0) {
5038 dictAdd(c->db->dict,dstkey,dstset);
5039 incrRefCount(dstkey);
5040 addReplyLong(c,dictSize((dict*)dstset->ptr));
5041 } else {
5042 decrRefCount(dstset);
5043 addReply(c,shared.czero);
5044 }
5045 server.dirty++;
5046 } else {
5047 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5048 }
5049 zfree(dv);
5050 }
5051
5052 static void sinterCommand(redisClient *c) {
5053 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5054 }
5055
5056 static void sinterstoreCommand(redisClient *c) {
5057 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5058 }
5059
/* Operation selectors, passed as the 'op' argument of
 * sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
5063
5064 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5065 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5066 dictIterator *di;
5067 dictEntry *de;
5068 robj *dstset = NULL;
5069 int j, cardinality = 0;
5070
5071 for (j = 0; j < setsnum; j++) {
5072 robj *setobj;
5073
5074 setobj = dstkey ?
5075 lookupKeyWrite(c->db,setskeys[j]) :
5076 lookupKeyRead(c->db,setskeys[j]);
5077 if (!setobj) {
5078 dv[j] = NULL;
5079 continue;
5080 }
5081 if (setobj->type != REDIS_SET) {
5082 zfree(dv);
5083 addReply(c,shared.wrongtypeerr);
5084 return;
5085 }
5086 dv[j] = setobj->ptr;
5087 }
5088
5089 /* We need a temp set object to store our union. If the dstkey
5090 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5091 * this set object will be the resulting object to set into the target key*/
5092 dstset = createSetObject();
5093
5094 /* Iterate all the elements of all the sets, add every element a single
5095 * time to the result set */
5096 for (j = 0; j < setsnum; j++) {
5097 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5098 if (!dv[j]) continue; /* non existing keys are like empty sets */
5099
5100 di = dictGetIterator(dv[j]);
5101
5102 while((de = dictNext(di)) != NULL) {
5103 robj *ele;
5104
5105 /* dictAdd will not add the same element multiple times */
5106 ele = dictGetEntryKey(de);
5107 if (op == REDIS_OP_UNION || j == 0) {
5108 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5109 incrRefCount(ele);
5110 cardinality++;
5111 }
5112 } else if (op == REDIS_OP_DIFF) {
5113 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5114 cardinality--;
5115 }
5116 }
5117 }
5118 dictReleaseIterator(di);
5119
5120 /* result set is empty? Exit asap. */
5121 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5122 }
5123
5124 /* Output the content of the resulting set, if not in STORE mode */
5125 if (!dstkey) {
5126 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5127 di = dictGetIterator(dstset->ptr);
5128 while((de = dictNext(di)) != NULL) {
5129 robj *ele;
5130
5131 ele = dictGetEntryKey(de);
5132 addReplyBulk(c,ele);
5133 }
5134 dictReleaseIterator(di);
5135 decrRefCount(dstset);
5136 } else {
5137 /* If we have a target key where to store the resulting set
5138 * create this key with the result set inside */
5139 deleteKey(c->db,dstkey);
5140 if (dictSize((dict*)dstset->ptr) > 0) {
5141 dictAdd(c->db->dict,dstkey,dstset);
5142 incrRefCount(dstkey);
5143 addReplyLong(c,dictSize((dict*)dstset->ptr));
5144 } else {
5145 decrRefCount(dstset);
5146 addReply(c,shared.czero);
5147 }
5148 server.dirty++;
5149 }
5150 zfree(dv);
5151 }
5152
5153 static void sunionCommand(redisClient *c) {
5154 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5155 }
5156
5157 static void sunionstoreCommand(redisClient *c) {
5158 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5159 }
5160
5161 static void sdiffCommand(redisClient *c) {
5162 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5163 }
5164
5165 static void sdiffstoreCommand(redisClient *c) {
5166 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5167 }
5168
5169 /* ==================================== ZSets =============================== */
5170
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5178
5179 /* This skiplist implementation is almost a C translation of the original
5180 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5181 * Alternative to Balanced Trees", modified in three ways:
5182 * a) this implementation allows for repeated values.
5183 * b) the comparison is not just by key (our 'score') but by satellite data.
5184 * c) there is a back pointer, so it's a doubly linked list with the back
5185 * pointers being only at "level 1". This allows to traverse the list
5186 * from tail to head, useful for ZREVRANGE. */
5187
5188 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5189 zskiplistNode *zn = zmalloc(sizeof(*zn));
5190
5191 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5192 if (level > 0)
5193 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5194 zn->score = score;
5195 zn->obj = obj;
5196 return zn;
5197 }
5198
5199 static zskiplist *zslCreate(void) {
5200 int j;
5201 zskiplist *zsl;
5202
5203 zsl = zmalloc(sizeof(*zsl));
5204 zsl->level = 1;
5205 zsl->length = 0;
5206 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5207 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5208 zsl->header->forward[j] = NULL;
5209
5210 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5211 if (j < ZSKIPLIST_MAXLEVEL-1)
5212 zsl->header->span[j] = 0;
5213 }
5214 zsl->header->backward = NULL;
5215 zsl->tail = NULL;
5216 return zsl;
5217 }
5218
5219 static void zslFreeNode(zskiplistNode *node) {
5220 decrRefCount(node->obj);
5221 zfree(node->forward);
5222 zfree(node->span);
5223 zfree(node);
5224 }
5225
5226 static void zslFree(zskiplist *zsl) {
5227 zskiplistNode *node = zsl->header->forward[0], *next;
5228
5229 zfree(zsl->header->forward);
5230 zfree(zsl->header->span);
5231 zfree(zsl->header);
5232 while(node) {
5233 next = node->forward[0];
5234 zslFreeNode(node);
5235 node = next;
5236 }
5237 zfree(zsl);
5238 }
5239
5240 static int zslRandomLevel(void) {
5241 int level = 1;
5242 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5243 level += 1;
5244 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5245 }
5246
5247 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5248 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5249 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5250 int i, level;
5251
5252 x = zsl->header;
5253 for (i = zsl->level-1; i >= 0; i--) {
5254 /* store rank that is crossed to reach the insert position */
5255 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5256
5257 while (x->forward[i] &&
5258 (x->forward[i]->score < score ||
5259 (x->forward[i]->score == score &&
5260 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5261 rank[i] += i > 0 ? x->span[i-1] : 1;
5262 x = x->forward[i];
5263 }
5264 update[i] = x;
5265 }
5266 /* we assume the key is not already inside, since we allow duplicated
5267 * scores, and the re-insertion of score and redis object should never
5268 * happpen since the caller of zslInsert() should test in the hash table
5269 * if the element is already inside or not. */
5270 level = zslRandomLevel();
5271 if (level > zsl->level) {
5272 for (i = zsl->level; i < level; i++) {
5273 rank[i] = 0;
5274 update[i] = zsl->header;
5275 update[i]->span[i-1] = zsl->length;
5276 }
5277 zsl->level = level;
5278 }
5279 x = zslCreateNode(level,score,obj);
5280 for (i = 0; i < level; i++) {
5281 x->forward[i] = update[i]->forward[i];
5282 update[i]->forward[i] = x;
5283
5284 /* update span covered by update[i] as x is inserted here */
5285 if (i > 0) {
5286 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5287 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5288 }
5289 }
5290
5291 /* increment span for untouched levels */
5292 for (i = level; i < zsl->level; i++) {
5293 update[i]->span[i-1]++;
5294 }
5295
5296 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5297 if (x->forward[0])
5298 x->forward[0]->backward = x;
5299 else
5300 zsl->tail = x;
5301 zsl->length++;
5302 }
5303
5304 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5305 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5306 int i;
5307 for (i = 0; i < zsl->level; i++) {
5308 if (update[i]->forward[i] == x) {
5309 if (i > 0) {
5310 update[i]->span[i-1] += x->span[i-1] - 1;
5311 }
5312 update[i]->forward[i] = x->forward[i];
5313 } else {
5314 /* invariant: i > 0, because update[0]->forward[0]
5315 * is always equal to x */
5316 update[i]->span[i-1] -= 1;
5317 }
5318 }
5319 if (x->forward[0]) {
5320 x->forward[0]->backward = x->backward;
5321 } else {
5322 zsl->tail = x->backward;
5323 }
5324 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5325 zsl->level--;
5326 zsl->length--;
5327 }
5328
5329 /* Delete an element with matching score/object from the skiplist. */
5330 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5331 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5332 int i;
5333
5334 x = zsl->header;
5335 for (i = zsl->level-1; i >= 0; i--) {
5336 while (x->forward[i] &&
5337 (x->forward[i]->score < score ||
5338 (x->forward[i]->score == score &&
5339 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5340 x = x->forward[i];
5341 update[i] = x;
5342 }
5343 /* We may have multiple elements with the same score, what we need
5344 * is to find the element with both the right score and object. */
5345 x = x->forward[0];
5346 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5347 zslDeleteNode(zsl, x, update);
5348 zslFreeNode(x);
5349 return 1;
5350 } else {
5351 return 0; /* not found */
5352 }
5353 return 0; /* not found */
5354 }
5355
5356 /* Delete all the elements with score between min and max from the skiplist.
5357 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5358 * Note that this function takes the reference to the hash table view of the
5359 * sorted set, in order to remove the elements from the hash table too. */
5360 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5361 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5362 unsigned long removed = 0;
5363 int i;
5364
5365 x = zsl->header;
5366 for (i = zsl->level-1; i >= 0; i--) {
5367 while (x->forward[i] && x->forward[i]->score < min)
5368 x = x->forward[i];
5369 update[i] = x;
5370 }
5371 /* We may have multiple elements with the same score, what we need
5372 * is to find the element with both the right score and object. */
5373 x = x->forward[0];
5374 while (x && x->score <= max) {
5375 zskiplistNode *next = x->forward[0];
5376 zslDeleteNode(zsl, x, update);
5377 dictDelete(dict,x->obj);
5378 zslFreeNode(x);
5379 removed++;
5380 x = next;
5381 }
5382 return removed; /* not found */
5383 }
5384
5385 /* Delete all the elements with rank between start and end from the skiplist.
5386 * Start and end are inclusive. Note that start and end need to be 1-based */
5387 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5388 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5389 unsigned long traversed = 0, removed = 0;
5390 int i;
5391
5392 x = zsl->header;
5393 for (i = zsl->level-1; i >= 0; i--) {
5394 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5395 traversed += i > 0 ? x->span[i-1] : 1;
5396 x = x->forward[i];
5397 }
5398 update[i] = x;
5399 }
5400
5401 traversed++;
5402 x = x->forward[0];
5403 while (x && traversed <= end) {
5404 zskiplistNode *next = x->forward[0];
5405 zslDeleteNode(zsl, x, update);
5406 dictDelete(dict,x->obj);
5407 zslFreeNode(x);
5408 removed++;
5409 traversed++;
5410 x = next;
5411 }
5412 return removed;
5413 }
5414
5415 /* Find the first node having a score equal or greater than the specified one.
5416 * Returns NULL if there is no match. */
5417 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5418 zskiplistNode *x;
5419 int i;
5420
5421 x = zsl->header;
5422 for (i = zsl->level-1; i >= 0; i--) {
5423 while (x->forward[i] && x->forward[i]->score < score)
5424 x = x->forward[i];
5425 }
5426 /* We may have multiple elements with the same score, what we need
5427 * is to find the element with both the right score and object. */
5428 return x->forward[0];
5429 }
5430
5431 /* Find the rank for an element by both score and key.
5432 * Returns 0 when the element cannot be found, rank otherwise.
5433 * Note that the rank is 1-based due to the span of zsl->header to the
5434 * first element. */
5435 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5436 zskiplistNode *x;
5437 unsigned long rank = 0;
5438 int i;
5439
5440 x = zsl->header;
5441 for (i = zsl->level-1; i >= 0; i--) {
5442 while (x->forward[i] &&
5443 (x->forward[i]->score < score ||
5444 (x->forward[i]->score == score &&
5445 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5446 rank += i > 0 ? x->span[i-1] : 1;
5447 x = x->forward[i];
5448 }
5449
5450 /* x might be equal to zsl->header, so test if obj is non-NULL */
5451 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5452 return rank;
5453 }
5454 }
5455 return 0;
5456 }
5457
5458 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5459 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5460 zskiplistNode *x;
5461 unsigned long traversed = 0;
5462 int i;
5463
5464 x = zsl->header;
5465 for (i = zsl->level-1; i >= 0; i--) {
5466 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5467 {
5468 traversed += i > 0 ? x->span[i-1] : 1;
5469 x = x->forward[i];
5470 }
5471 if (traversed == rank) {
5472 return x;
5473 }
5474 }
5475 return NULL;
5476 }
5477
5478 /* The actual Z-commands implementations */
5479
5480 /* This generic command implements both ZADD and ZINCRBY.
5481 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5482 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5483 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5484 robj *zsetobj;
5485 zset *zs;
5486 double *score;
5487
5488 zsetobj = lookupKeyWrite(c->db,key);
5489 if (zsetobj == NULL) {
5490 zsetobj = createZsetObject();
5491 dictAdd(c->db->dict,key,zsetobj);
5492 incrRefCount(key);
5493 } else {
5494 if (zsetobj->type != REDIS_ZSET) {
5495 addReply(c,shared.wrongtypeerr);
5496 return;
5497 }
5498 }
5499 zs = zsetobj->ptr;
5500
5501 /* Ok now since we implement both ZADD and ZINCRBY here the code
5502 * needs to handle the two different conditions. It's all about setting
5503 * '*score', that is, the new score to set, to the right value. */
5504 score = zmalloc(sizeof(double));
5505 if (doincrement) {
5506 dictEntry *de;
5507
5508 /* Read the old score. If the element was not present starts from 0 */
5509 de = dictFind(zs->dict,ele);
5510 if (de) {
5511 double *oldscore = dictGetEntryVal(de);
5512 *score = *oldscore + scoreval;
5513 } else {
5514 *score = scoreval;
5515 }
5516 } else {
5517 *score = scoreval;
5518 }
5519
5520 /* What follows is a simple remove and re-insert operation that is common
5521 * to both ZADD and ZINCRBY... */
5522 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5523 /* case 1: New element */
5524 incrRefCount(ele); /* added to hash */
5525 zslInsert(zs->zsl,*score,ele);
5526 incrRefCount(ele); /* added to skiplist */
5527 server.dirty++;
5528 if (doincrement)
5529 addReplyDouble(c,*score);
5530 else
5531 addReply(c,shared.cone);
5532 } else {
5533 dictEntry *de;
5534 double *oldscore;
5535
5536 /* case 2: Score update operation */
5537 de = dictFind(zs->dict,ele);
5538 redisAssert(de != NULL);
5539 oldscore = dictGetEntryVal(de);
5540 if (*score != *oldscore) {
5541 int deleted;
5542
5543 /* Remove and insert the element in the skip list with new score */
5544 deleted = zslDelete(zs->zsl,*oldscore,ele);
5545 redisAssert(deleted != 0);
5546 zslInsert(zs->zsl,*score,ele);
5547 incrRefCount(ele);
5548 /* Update the score in the hash table */
5549 dictReplace(zs->dict,ele,score);
5550 server.dirty++;
5551 } else {
5552 zfree(score);
5553 }
5554 if (doincrement)
5555 addReplyDouble(c,*score);
5556 else
5557 addReply(c,shared.czero);
5558 }
5559 }
5560
5561 static void zaddCommand(redisClient *c) {
5562 double scoreval;
5563
5564 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5565 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5566 }
5567
5568 static void zincrbyCommand(redisClient *c) {
5569 double scoreval;
5570
5571 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5572 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5573 }
5574
5575 static void zremCommand(redisClient *c) {
5576 robj *zsetobj;
5577 zset *zs;
5578 dictEntry *de;
5579 double *oldscore;
5580 int deleted;
5581
5582 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5583 checkType(c,zsetobj,REDIS_ZSET)) return;
5584
5585 zs = zsetobj->ptr;
5586 de = dictFind(zs->dict,c->argv[2]);
5587 if (de == NULL) {
5588 addReply(c,shared.czero);
5589 return;
5590 }
5591 /* Delete from the skiplist */
5592 oldscore = dictGetEntryVal(de);
5593 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5594 redisAssert(deleted != 0);
5595
5596 /* Delete from the hash table */
5597 dictDelete(zs->dict,c->argv[2]);
5598 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5599 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5600 server.dirty++;
5601 addReply(c,shared.cone);
5602 }
5603
5604 static void zremrangebyscoreCommand(redisClient *c) {
5605 double min;
5606 double max;
5607 long deleted;
5608 robj *zsetobj;
5609 zset *zs;
5610
5611 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5612 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5613
5614 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5615 checkType(c,zsetobj,REDIS_ZSET)) return;
5616
5617 zs = zsetobj->ptr;
5618 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5619 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5620 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5621 server.dirty += deleted;
5622 addReplyLong(c,deleted);
5623 }
5624
5625 static void zremrangebyrankCommand(redisClient *c) {
5626 long start;
5627 long end;
5628 int llen;
5629 long deleted;
5630 robj *zsetobj;
5631 zset *zs;
5632
5633 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5634 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5635
5636 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5637 checkType(c,zsetobj,REDIS_ZSET)) return;
5638 zs = zsetobj->ptr;
5639 llen = zs->zsl->length;
5640
5641 /* convert negative indexes */
5642 if (start < 0) start = llen+start;
5643 if (end < 0) end = llen+end;
5644 if (start < 0) start = 0;
5645 if (end < 0) end = 0;
5646
5647 /* indexes sanity checks */
5648 if (start > end || start >= llen) {
5649 addReply(c,shared.czero);
5650 return;
5651 }
5652 if (end >= llen) end = llen-1;
5653
5654 /* increment start and end because zsl*Rank functions
5655 * use 1-based rank */
5656 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5657 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5658 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5659 server.dirty += deleted;
5660 addReplyLong(c, deleted);
5661 }
5662
/* One input of a ZUNION/ZINTER operation. */
typedef struct {
    dict *dict;     /* hash table of the input sorted set, NULL if the key is missing */
    double weight;  /* weight given for this input, 1.0 by default */
} zsetopsrc;
5667
5668 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5669 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5670 unsigned long size1, size2;
5671 size1 = d1->dict ? dictSize(d1->dict) : 0;
5672 size2 = d2->dict ? dictSize(d2->dict) : 0;
5673 return size1 - size2;
5674 }
5675
/* Aggregation modes understood by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5679
5680 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5681 if (aggregate == REDIS_AGGR_SUM) {
5682 *target = *target + val;
5683 } else if (aggregate == REDIS_AGGR_MIN) {
5684 *target = val < *target ? val : *target;
5685 } else if (aggregate == REDIS_AGGR_MAX) {
5686 *target = val > *target ? val : *target;
5687 } else {
5688 /* safety net */
5689 redisPanic("Unknown ZUNION/INTER aggregate type");
5690 }
5691 }
5692
5693 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5694 int i, j, zsetnum;
5695 int aggregate = REDIS_AGGR_SUM;
5696 zsetopsrc *src;
5697 robj *dstobj;
5698 zset *dstzset;
5699 dictIterator *di;
5700 dictEntry *de;
5701
5702 /* expect zsetnum input keys to be given */
5703 zsetnum = atoi(c->argv[2]->ptr);
5704 if (zsetnum < 1) {
5705 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5706 return;
5707 }
5708
5709 /* test if the expected number of keys would overflow */
5710 if (3+zsetnum > c->argc) {
5711 addReply(c,shared.syntaxerr);
5712 return;
5713 }
5714
5715 /* read keys to be used for input */
5716 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5717 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5718 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5719 if (!zsetobj) {
5720 src[i].dict = NULL;
5721 } else {
5722 if (zsetobj->type != REDIS_ZSET) {
5723 zfree(src);
5724 addReply(c,shared.wrongtypeerr);
5725 return;
5726 }
5727 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5728 }
5729
5730 /* default all weights to 1 */
5731 src[i].weight = 1.0;
5732 }
5733
5734 /* parse optional extra arguments */
5735 if (j < c->argc) {
5736 int remaining = c->argc - j;
5737
5738 while (remaining) {
5739 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5740 j++; remaining--;
5741 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5742 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5743 return;
5744 }
5745 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5746 j++; remaining--;
5747 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5748 aggregate = REDIS_AGGR_SUM;
5749 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5750 aggregate = REDIS_AGGR_MIN;
5751 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5752 aggregate = REDIS_AGGR_MAX;
5753 } else {
5754 zfree(src);
5755 addReply(c,shared.syntaxerr);
5756 return;
5757 }
5758 j++; remaining--;
5759 } else {
5760 zfree(src);
5761 addReply(c,shared.syntaxerr);
5762 return;
5763 }
5764 }
5765 }
5766
5767 /* sort sets from the smallest to largest, this will improve our
5768 * algorithm's performance */
5769 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5770
5771 dstobj = createZsetObject();
5772 dstzset = dstobj->ptr;
5773
5774 if (op == REDIS_OP_INTER) {
5775 /* skip going over all entries if the smallest zset is NULL or empty */
5776 if (src[0].dict && dictSize(src[0].dict) > 0) {
5777 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5778 * from small to large, all src[i > 0].dict are non-empty too */
5779 di = dictGetIterator(src[0].dict);
5780 while((de = dictNext(di)) != NULL) {
5781 double *score = zmalloc(sizeof(double)), value;
5782 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5783
5784 for (j = 1; j < zsetnum; j++) {
5785 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5786 if (other) {
5787 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5788 zunionInterAggregate(score, value, aggregate);
5789 } else {
5790 break;
5791 }
5792 }
5793
5794 /* skip entry when not present in every source dict */
5795 if (j != zsetnum) {
5796 zfree(score);
5797 } else {
5798 robj *o = dictGetEntryKey(de);
5799 dictAdd(dstzset->dict,o,score);
5800 incrRefCount(o); /* added to dictionary */
5801 zslInsert(dstzset->zsl,*score,o);
5802 incrRefCount(o); /* added to skiplist */
5803 }
5804 }
5805 dictReleaseIterator(di);
5806 }
5807 } else if (op == REDIS_OP_UNION) {
5808 for (i = 0; i < zsetnum; i++) {
5809 if (!src[i].dict) continue;
5810
5811 di = dictGetIterator(src[i].dict);
5812 while((de = dictNext(di)) != NULL) {
5813 /* skip key when already processed */
5814 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5815
5816 double *score = zmalloc(sizeof(double)), value;
5817 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5818
5819 /* because the zsets are sorted by size, its only possible
5820 * for sets at larger indices to hold this entry */
5821 for (j = (i+1); j < zsetnum; j++) {
5822 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5823 if (other) {
5824 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5825 zunionInterAggregate(score, value, aggregate);
5826 }
5827 }
5828
5829 robj *o = dictGetEntryKey(de);
5830 dictAdd(dstzset->dict,o,score);
5831 incrRefCount(o); /* added to dictionary */
5832 zslInsert(dstzset->zsl,*score,o);
5833 incrRefCount(o); /* added to skiplist */
5834 }
5835 dictReleaseIterator(di);
5836 }
5837 } else {
5838 /* unknown operator */
5839 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5840 }
5841
5842 deleteKey(c->db,dstkey);
5843 if (dstzset->zsl->length) {
5844 dictAdd(c->db->dict,dstkey,dstobj);
5845 incrRefCount(dstkey);
5846 addReplyLong(c, dstzset->zsl->length);
5847 server.dirty++;
5848 } else {
5849 decrRefCount(dstobj);
5850 addReply(c, shared.czero);
5851 }
5852 zfree(src);
5853 }
5854
5855 static void zunionCommand(redisClient *c) {
5856 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5857 }
5858
5859 static void zinterCommand(redisClient *c) {
5860 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5861 }
5862
5863 static void zrangeGenericCommand(redisClient *c, int reverse) {
5864 robj *o;
5865 long start;
5866 long end;
5867 int withscores = 0;
5868 int llen;
5869 int rangelen, j;
5870 zset *zsetobj;
5871 zskiplist *zsl;
5872 zskiplistNode *ln;
5873 robj *ele;
5874
5875 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5876 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5877
5878 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5879 withscores = 1;
5880 } else if (c->argc >= 5) {
5881 addReply(c,shared.syntaxerr);
5882 return;
5883 }
5884
5885 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5886 || checkType(c,o,REDIS_ZSET)) return;
5887 zsetobj = o->ptr;
5888 zsl = zsetobj->zsl;
5889 llen = zsl->length;
5890
5891 /* convert negative indexes */
5892 if (start < 0) start = llen+start;
5893 if (end < 0) end = llen+end;
5894 if (start < 0) start = 0;
5895 if (end < 0) end = 0;
5896
5897 /* indexes sanity checks */
5898 if (start > end || start >= llen) {
5899 /* Out of range start or start > end result in empty list */
5900 addReply(c,shared.emptymultibulk);
5901 return;
5902 }
5903 if (end >= llen) end = llen-1;
5904 rangelen = (end-start)+1;
5905
5906 /* check if starting point is trivial, before searching
5907 * the element in log(N) time */
5908 if (reverse) {
5909 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5910 } else {
5911 ln = start == 0 ?
5912 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5913 }
5914
5915 /* Return the result in form of a multi-bulk reply */
5916 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5917 withscores ? (rangelen*2) : rangelen));
5918 for (j = 0; j < rangelen; j++) {
5919 ele = ln->obj;
5920 addReplyBulk(c,ele);
5921 if (withscores)
5922 addReplyDouble(c,ln->score);
5923 ln = reverse ? ln->backward : ln->forward[0];
5924 }
5925 }
5926
5927 static void zrangeCommand(redisClient *c) {
5928 zrangeGenericCommand(c,0);
5929 }
5930
5931 static void zrevrangeCommand(redisClient *c) {
5932 zrangeGenericCommand(c,1);
5933 }
5934
5935 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5936 * If justcount is non-zero, just the count is returned. */
5937 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5938 robj *o;
5939 double min, max;
5940 int minex = 0, maxex = 0; /* are min or max exclusive? */
5941 int offset = 0, limit = -1;
5942 int withscores = 0;
5943 int badsyntax = 0;
5944
5945 /* Parse the min-max interval. If one of the values is prefixed
5946 * by the "(" character, it's considered "open". For instance
5947 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5948 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5949 if (((char*)c->argv[2]->ptr)[0] == '(') {
5950 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5951 minex = 1;
5952 } else {
5953 min = strtod(c->argv[2]->ptr,NULL);
5954 }
5955 if (((char*)c->argv[3]->ptr)[0] == '(') {
5956 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5957 maxex = 1;
5958 } else {
5959 max = strtod(c->argv[3]->ptr,NULL);
5960 }
5961
5962 /* Parse "WITHSCORES": note that if the command was called with
5963 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5964 * enter the following paths to parse WITHSCORES and LIMIT. */
5965 if (c->argc == 5 || c->argc == 8) {
5966 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5967 withscores = 1;
5968 else
5969 badsyntax = 1;
5970 }
5971 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5972 badsyntax = 1;
5973 if (badsyntax) {
5974 addReplySds(c,
5975 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5976 return;
5977 }
5978
5979 /* Parse "LIMIT" */
5980 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5981 addReply(c,shared.syntaxerr);
5982 return;
5983 } else if (c->argc == (7 + withscores)) {
5984 offset = atoi(c->argv[5]->ptr);
5985 limit = atoi(c->argv[6]->ptr);
5986 if (offset < 0) offset = 0;
5987 }
5988
5989 /* Ok, lookup the key and get the range */
5990 o = lookupKeyRead(c->db,c->argv[1]);
5991 if (o == NULL) {
5992 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5993 } else {
5994 if (o->type != REDIS_ZSET) {
5995 addReply(c,shared.wrongtypeerr);
5996 } else {
5997 zset *zsetobj = o->ptr;
5998 zskiplist *zsl = zsetobj->zsl;
5999 zskiplistNode *ln;
6000 robj *ele, *lenobj = NULL;
6001 unsigned long rangelen = 0;
6002
6003 /* Get the first node with the score >= min, or with
6004 * score > min if 'minex' is true. */
6005 ln = zslFirstWithScore(zsl,min);
6006 while (minex && ln && ln->score == min) ln = ln->forward[0];
6007
6008 if (ln == NULL) {
6009 /* No element matching the speciifed interval */
6010 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
6011 return;
6012 }
6013
6014 /* We don't know in advance how many matching elements there
6015 * are in the list, so we push this object that will represent
6016 * the multi-bulk length in the output buffer, and will "fix"
6017 * it later */
6018 if (!justcount) {
6019 lenobj = createObject(REDIS_STRING,NULL);
6020 addReply(c,lenobj);
6021 decrRefCount(lenobj);
6022 }
6023
6024 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
6025 if (offset) {
6026 offset--;
6027 ln = ln->forward[0];
6028 continue;
6029 }
6030 if (limit == 0) break;
6031 if (!justcount) {
6032 ele = ln->obj;
6033 addReplyBulk(c,ele);
6034 if (withscores)
6035 addReplyDouble(c,ln->score);
6036 }
6037 ln = ln->forward[0];
6038 rangelen++;
6039 if (limit > 0) limit--;
6040 }
6041 if (justcount) {
6042 addReplyLong(c,(long)rangelen);
6043 } else {
6044 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6045 withscores ? (rangelen*2) : rangelen);
6046 }
6047 }
6048 }
6049 }
6050
6051 static void zrangebyscoreCommand(redisClient *c) {
6052 genericZrangebyscoreCommand(c,0);
6053 }
6054
6055 static void zcountCommand(redisClient *c) {
6056 genericZrangebyscoreCommand(c,1);
6057 }
6058
6059 static void zcardCommand(redisClient *c) {
6060 robj *o;
6061 zset *zs;
6062
6063 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6064 checkType(c,o,REDIS_ZSET)) return;
6065
6066 zs = o->ptr;
6067 addReplyUlong(c,zs->zsl->length);
6068 }
6069
6070 static void zscoreCommand(redisClient *c) {
6071 robj *o;
6072 zset *zs;
6073 dictEntry *de;
6074
6075 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6076 checkType(c,o,REDIS_ZSET)) return;
6077
6078 zs = o->ptr;
6079 de = dictFind(zs->dict,c->argv[2]);
6080 if (!de) {
6081 addReply(c,shared.nullbulk);
6082 } else {
6083 double *score = dictGetEntryVal(de);
6084
6085 addReplyDouble(c,*score);
6086 }
6087 }
6088
6089 static void zrankGenericCommand(redisClient *c, int reverse) {
6090 robj *o;
6091 zset *zs;
6092 zskiplist *zsl;
6093 dictEntry *de;
6094 unsigned long rank;
6095 double *score;
6096
6097 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6098 checkType(c,o,REDIS_ZSET)) return;
6099
6100 zs = o->ptr;
6101 zsl = zs->zsl;
6102 de = dictFind(zs->dict,c->argv[2]);
6103 if (!de) {
6104 addReply(c,shared.nullbulk);
6105 return;
6106 }
6107
6108 score = dictGetEntryVal(de);
6109 rank = zslGetRank(zsl, *score, c->argv[2]);
6110 if (rank) {
6111 if (reverse) {
6112 addReplyLong(c, zsl->length - rank);
6113 } else {
6114 addReplyLong(c, rank-1);
6115 }
6116 } else {
6117 addReply(c,shared.nullbulk);
6118 }
6119 }
6120
6121 static void zrankCommand(redisClient *c) {
6122 zrankGenericCommand(c, 0);
6123 }
6124
6125 static void zrevrankCommand(redisClient *c) {
6126 zrankGenericCommand(c, 1);
6127 }
6128
6129 /* ========================= Hashes utility functions ======================= */
6130 #define REDIS_HASH_KEY 1
6131 #define REDIS_HASH_VALUE 2
6132
6133 /* Check the length of a number of objects to see if we need to convert a
6134 * zipmap to a real hash. Note that we only check string encoded objects
6135 * as their string length can be queried in constant time. */
6136 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6137 int i;
6138 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6139
6140 for (i = start; i <= end; i++) {
6141 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6142 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6143 {
6144 convertToRealHash(subject);
6145 return;
6146 }
6147 }
6148 }
6149
6150 /* Encode given objects in-place when the hash uses a dict. */
6151 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6152 if (subject->encoding == REDIS_ENCODING_HT) {
6153 if (o1) *o1 = tryObjectEncoding(*o1);
6154 if (o2) *o2 = tryObjectEncoding(*o2);
6155 }
6156 }
6157
6158 /* Get the value from a hash identified by key. Returns either a string
6159 * object or NULL if the value cannot be found. The refcount of the object
6160 * is always increased by 1 when the value was found. */
6161 static robj *hashGet(robj *o, robj *key) {
6162 robj *value = NULL;
6163 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6164 unsigned char *v;
6165 unsigned int vlen;
6166 key = getDecodedObject(key);
6167 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6168 value = createStringObject((char*)v,vlen);
6169 }
6170 decrRefCount(key);
6171 } else {
6172 dictEntry *de = dictFind(o->ptr,key);
6173 if (de != NULL) {
6174 value = dictGetEntryVal(de);
6175 incrRefCount(value);
6176 }
6177 }
6178 return value;
6179 }
6180
6181 /* Test if the key exists in the given hash. Returns 1 if the key
6182 * exists and 0 when it doesn't. */
6183 static int hashExists(robj *o, robj *key) {
6184 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6185 key = getDecodedObject(key);
6186 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6187 decrRefCount(key);
6188 return 1;
6189 }
6190 decrRefCount(key);
6191 } else {
6192 if (dictFind(o->ptr,key) != NULL) {
6193 return 1;
6194 }
6195 }
6196 return 0;
6197 }
6198
6199 /* Add an element, discard the old if the key already exists.
6200 * Return 0 on insert and 1 on update. */
6201 static int hashSet(robj *o, robj *key, robj *value) {
6202 int update = 0;
6203 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6204 key = getDecodedObject(key);
6205 value = getDecodedObject(value);
6206 o->ptr = zipmapSet(o->ptr,
6207 key->ptr,sdslen(key->ptr),
6208 value->ptr,sdslen(value->ptr), &update);
6209 decrRefCount(key);
6210 decrRefCount(value);
6211
6212 /* Check if the zipmap needs to be upgraded to a real hash table */
6213 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6214 convertToRealHash(o);
6215 } else {
6216 if (dictReplace(o->ptr,key,value)) {
6217 /* Insert */
6218 incrRefCount(key);
6219 } else {
6220 /* Update */
6221 update = 1;
6222 }
6223 incrRefCount(value);
6224 }
6225 return update;
6226 }
6227
6228 /* Delete an element from a hash.
6229 * Return 1 on deleted and 0 on not found. */
6230 static int hashDelete(robj *o, robj *key) {
6231 int deleted = 0;
6232 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6233 key = getDecodedObject(key);
6234 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6235 decrRefCount(key);
6236 } else {
6237 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6238 /* Always check if the dictionary needs a resize after a delete. */
6239 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6240 }
6241 return deleted;
6242 }
6243
6244 /* Return the number of elements in a hash. */
6245 static unsigned long hashLength(robj *o) {
6246 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6247 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6248 }
6249
6250 /* Structure to hold hash iteration abstration. Note that iteration over
6251 * hashes involves both fields and values. Because it is possible that
6252 * not both are required, store pointers in the iterator to avoid
6253 * unnecessary memory allocation for fields/values. */
6254 typedef struct {
6255 int encoding;
6256 unsigned char *zi;
6257 unsigned char *zk, *zv;
6258 unsigned int zklen, zvlen;
6259
6260 dictIterator *di;
6261 dictEntry *de;
6262 } hashIterator;
6263
6264 static hashIterator *hashInitIterator(robj *subject) {
6265 hashIterator *hi = zmalloc(sizeof(hashIterator));
6266 hi->encoding = subject->encoding;
6267 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6268 hi->zi = zipmapRewind(subject->ptr);
6269 } else if (hi->encoding == REDIS_ENCODING_HT) {
6270 hi->di = dictGetIterator(subject->ptr);
6271 } else {
6272 redisAssert(NULL);
6273 }
6274 return hi;
6275 }
6276
6277 static void hashReleaseIterator(hashIterator *hi) {
6278 if (hi->encoding == REDIS_ENCODING_HT) {
6279 dictReleaseIterator(hi->di);
6280 }
6281 zfree(hi);
6282 }
6283
6284 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6285 * could be found and REDIS_ERR when the iterator reaches the end. */
6286 static int hashNext(hashIterator *hi) {
6287 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6288 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6289 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6290 } else {
6291 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6292 }
6293 return REDIS_OK;
6294 }
6295
6296 /* Get key or value object at current iteration position.
6297 * This increases the refcount of the field object by 1. */
6298 static robj *hashCurrent(hashIterator *hi, int what) {
6299 robj *o;
6300 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6301 if (what & REDIS_HASH_KEY) {
6302 o = createStringObject((char*)hi->zk,hi->zklen);
6303 } else {
6304 o = createStringObject((char*)hi->zv,hi->zvlen);
6305 }
6306 } else {
6307 if (what & REDIS_HASH_KEY) {
6308 o = dictGetEntryKey(hi->de);
6309 } else {
6310 o = dictGetEntryVal(hi->de);
6311 }
6312 incrRefCount(o);
6313 }
6314 return o;
6315 }
6316
6317 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6318 robj *o = lookupKeyWrite(c->db,key);
6319 if (o == NULL) {
6320 o = createHashObject();
6321 dictAdd(c->db->dict,key,o);
6322 incrRefCount(key);
6323 } else {
6324 if (o->type != REDIS_HASH) {
6325 addReply(c,shared.wrongtypeerr);
6326 return NULL;
6327 }
6328 }
6329 return o;
6330 }
6331
6332 /* ============================= Hash commands ============================== */
6333 static void hsetCommand(redisClient *c) {
6334 int update;
6335 robj *o;
6336
6337 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6338 hashTryConversion(o,c->argv,2,3);
6339 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6340 update = hashSet(o,c->argv[2],c->argv[3]);
6341 addReply(c, update ? shared.czero : shared.cone);
6342 server.dirty++;
6343 }
6344
6345 static void hsetnxCommand(redisClient *c) {
6346 robj *o;
6347 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6348 hashTryConversion(o,c->argv,2,3);
6349
6350 if (hashExists(o, c->argv[2])) {
6351 addReply(c, shared.czero);
6352 } else {
6353 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6354 hashSet(o,c->argv[2],c->argv[3]);
6355 addReply(c, shared.cone);
6356 server.dirty++;
6357 }
6358 }
6359
6360 static void hmsetCommand(redisClient *c) {
6361 int i;
6362 robj *o;
6363
6364 if ((c->argc % 2) == 1) {
6365 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6366 return;
6367 }
6368
6369 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6370 hashTryConversion(o,c->argv,2,c->argc-1);
6371 for (i = 2; i < c->argc; i += 2) {
6372 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6373 hashSet(o,c->argv[i],c->argv[i+1]);
6374 }
6375 addReply(c, shared.ok);
6376 server.dirty++;
6377 }
6378
6379 static void hincrbyCommand(redisClient *c) {
6380 long long value, incr;
6381 robj *o, *current, *new;
6382
6383 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6384 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6385 if ((current = hashGet(o,c->argv[2])) != NULL) {
6386 if (current->encoding == REDIS_ENCODING_RAW)
6387 value = strtoll(current->ptr,NULL,10);
6388 else if (current->encoding == REDIS_ENCODING_INT)
6389 value = (long)current->ptr;
6390 else
6391 redisAssert(1 != 1);
6392 decrRefCount(current);
6393 } else {
6394 value = 0;
6395 }
6396
6397 value += incr;
6398 new = createStringObjectFromLongLong(value);
6399 hashTryObjectEncoding(o,&c->argv[2],NULL);
6400 hashSet(o,c->argv[2],new);
6401 decrRefCount(new);
6402 addReplyLongLong(c,value);
6403 server.dirty++;
6404 }
6405
6406 static void hgetCommand(redisClient *c) {
6407 robj *o, *value;
6408 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6409 checkType(c,o,REDIS_HASH)) return;
6410
6411 if ((value = hashGet(o,c->argv[2])) != NULL) {
6412 addReplyBulk(c,value);
6413 decrRefCount(value);
6414 } else {
6415 addReply(c,shared.nullbulk);
6416 }
6417 }
6418
6419 static void hmgetCommand(redisClient *c) {
6420 int i;
6421 robj *o, *value;
6422 o = lookupKeyRead(c->db,c->argv[1]);
6423 if (o != NULL && o->type != REDIS_HASH) {
6424 addReply(c,shared.wrongtypeerr);
6425 }
6426
6427 /* Note the check for o != NULL happens inside the loop. This is
6428 * done because objects that cannot be found are considered to be
6429 * an empty hash. The reply should then be a series of NULLs. */
6430 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6431 for (i = 2; i < c->argc; i++) {
6432 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6433 addReplyBulk(c,value);
6434 decrRefCount(value);
6435 } else {
6436 addReply(c,shared.nullbulk);
6437 }
6438 }
6439 }
6440
6441 static void hdelCommand(redisClient *c) {
6442 robj *o;
6443 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6444 checkType(c,o,REDIS_HASH)) return;
6445
6446 if (hashDelete(o,c->argv[2])) {
6447 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6448 addReply(c,shared.cone);
6449 server.dirty++;
6450 } else {
6451 addReply(c,shared.czero);
6452 }
6453 }
6454
6455 static void hlenCommand(redisClient *c) {
6456 robj *o;
6457 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6458 checkType(c,o,REDIS_HASH)) return;
6459
6460 addReplyUlong(c,hashLength(o));
6461 }
6462
6463 static void genericHgetallCommand(redisClient *c, int flags) {
6464 robj *o, *lenobj, *obj;
6465 unsigned long count = 0;
6466 hashIterator *hi;
6467
6468 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6469 || checkType(c,o,REDIS_HASH)) return;
6470
6471 lenobj = createObject(REDIS_STRING,NULL);
6472 addReply(c,lenobj);
6473 decrRefCount(lenobj);
6474
6475 hi = hashInitIterator(o);
6476 while (hashNext(hi) != REDIS_ERR) {
6477 if (flags & REDIS_HASH_KEY) {
6478 obj = hashCurrent(hi,REDIS_HASH_KEY);
6479 addReplyBulk(c,obj);
6480 decrRefCount(obj);
6481 count++;
6482 }
6483 if (flags & REDIS_HASH_VALUE) {
6484 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6485 addReplyBulk(c,obj);
6486 decrRefCount(obj);
6487 count++;
6488 }
6489 }
6490 hashReleaseIterator(hi);
6491
6492 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6493 }
6494
6495 static void hkeysCommand(redisClient *c) {
6496 genericHgetallCommand(c,REDIS_HASH_KEY);
6497 }
6498
6499 static void hvalsCommand(redisClient *c) {
6500 genericHgetallCommand(c,REDIS_HASH_VALUE);
6501 }
6502
6503 static void hgetallCommand(redisClient *c) {
6504 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6505 }
6506
6507 static void hexistsCommand(redisClient *c) {
6508 robj *o;
6509 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6510 checkType(c,o,REDIS_HASH)) return;
6511
6512 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6513 }
6514
6515 static void convertToRealHash(robj *o) {
6516 unsigned char *key, *val, *p, *zm = o->ptr;
6517 unsigned int klen, vlen;
6518 dict *dict = dictCreate(&hashDictType,NULL);
6519
6520 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6521 p = zipmapRewind(zm);
6522 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6523 robj *keyobj, *valobj;
6524
6525 keyobj = createStringObject((char*)key,klen);
6526 valobj = createStringObject((char*)val,vlen);
6527 keyobj = tryObjectEncoding(keyobj);
6528 valobj = tryObjectEncoding(valobj);
6529 dictAdd(dict,keyobj,valobj);
6530 }
6531 o->encoding = REDIS_ENCODING_HT;
6532 o->ptr = dict;
6533 zfree(zm);
6534 }
6535
6536 /* ========================= Non type-specific commands ==================== */
6537
6538 static void flushdbCommand(redisClient *c) {
6539 server.dirty += dictSize(c->db->dict);
6540 dictEmpty(c->db->dict);
6541 dictEmpty(c->db->expires);
6542 addReply(c,shared.ok);
6543 }
6544
6545 static void flushallCommand(redisClient *c) {
6546 server.dirty += emptyDb();
6547 addReply(c,shared.ok);
6548 if (server.bgsavechildpid != -1) {
6549 kill(server.bgsavechildpid,SIGKILL);
6550 rdbRemoveTempFile(server.bgsavechildpid);
6551 }
6552 rdbSave(server.dbfilename);
6553 server.dirty++;
6554 }
6555
6556 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6557 redisSortOperation *so = zmalloc(sizeof(*so));
6558 so->type = type;
6559 so->pattern = pattern;
6560 return so;
6561 }
6562
6563 /* Return the value associated to the key with a name obtained
6564 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6565 * The returned object will always have its refcount increased by 1
6566 * when it is non-NULL. */
6567 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6568 char *p, *f;
6569 sds spat, ssub;
6570 robj keyobj, fieldobj, *o;
6571 int prefixlen, sublen, postfixlen, fieldlen;
6572 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6573 struct {
6574 long len;
6575 long free;
6576 char buf[REDIS_SORTKEY_MAX+1];
6577 } keyname, fieldname;
6578
6579 /* If the pattern is "#" return the substitution object itself in order
6580 * to implement the "SORT ... GET #" feature. */
6581 spat = pattern->ptr;
6582 if (spat[0] == '#' && spat[1] == '\0') {
6583 incrRefCount(subst);
6584 return subst;
6585 }
6586
6587 /* The substitution object may be specially encoded. If so we create
6588 * a decoded object on the fly. Otherwise getDecodedObject will just
6589 * increment the ref count, that we'll decrement later. */
6590 subst = getDecodedObject(subst);
6591
6592 ssub = subst->ptr;
6593 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6594 p = strchr(spat,'*');
6595 if (!p) {
6596 decrRefCount(subst);
6597 return NULL;
6598 }
6599
6600 /* Find out if we're dealing with a hash dereference. */
6601 if ((f = strstr(p+1, "->")) != NULL) {
6602 fieldlen = sdslen(spat)-(f-spat);
6603 /* this also copies \0 character */
6604 memcpy(fieldname.buf,f+2,fieldlen-1);
6605 fieldname.len = fieldlen-2;
6606 } else {
6607 fieldlen = 0;
6608 }
6609
6610 prefixlen = p-spat;
6611 sublen = sdslen(ssub);
6612 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6613 memcpy(keyname.buf,spat,prefixlen);
6614 memcpy(keyname.buf+prefixlen,ssub,sublen);
6615 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6616 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6617 keyname.len = prefixlen+sublen+postfixlen;
6618 decrRefCount(subst);
6619
6620 /* Lookup substituted key */
6621 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6622 o = lookupKeyRead(db,&keyobj);
6623 if (o == NULL) return NULL;
6624
6625 if (fieldlen > 0) {
6626 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6627
6628 /* Retrieve value from hash by the field name. This operation
6629 * already increases the refcount of the returned object. */
6630 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6631 o = hashGet(o, &fieldobj);
6632 } else {
6633 if (o->type != REDIS_STRING) return NULL;
6634
6635 /* Every object that this function returns needs to have its refcount
6636 * increased. sortCommand decreases it again. */
6637 incrRefCount(o);
6638 }
6639
6640 return o;
6641 }
6642
6643 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6644 * the additional parameter is not standard but a BSD-specific we have to
6645 * pass sorting parameters via the global 'server' structure */
6646 static int sortCompare(const void *s1, const void *s2) {
6647 const redisSortObject *so1 = s1, *so2 = s2;
6648 int cmp;
6649
6650 if (!server.sort_alpha) {
6651 /* Numeric sorting. Here it's trivial as we precomputed scores */
6652 if (so1->u.score > so2->u.score) {
6653 cmp = 1;
6654 } else if (so1->u.score < so2->u.score) {
6655 cmp = -1;
6656 } else {
6657 cmp = 0;
6658 }
6659 } else {
6660 /* Alphanumeric sorting */
6661 if (server.sort_bypattern) {
6662 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6663 /* At least one compare object is NULL */
6664 if (so1->u.cmpobj == so2->u.cmpobj)
6665 cmp = 0;
6666 else if (so1->u.cmpobj == NULL)
6667 cmp = -1;
6668 else
6669 cmp = 1;
6670 } else {
6671 /* We have both the objects, use strcoll */
6672 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6673 }
6674 } else {
6675 /* Compare elements directly. */
6676 cmp = compareStringObjects(so1->obj,so2->obj);
6677 }
6678 }
6679 return server.sort_desc ? -cmp : cmp;
6680 }
6681
6682 /* The SORT command is the most complex command in Redis. Warning: this code
6683 * is optimized for speed and a bit less for readability */
6684 static void sortCommand(redisClient *c) {
6685 list *operations;
6686 int outputlen = 0;
6687 int desc = 0, alpha = 0;
6688 int limit_start = 0, limit_count = -1, start, end;
6689 int j, dontsort = 0, vectorlen;
6690 int getop = 0; /* GET operation counter */
6691 robj *sortval, *sortby = NULL, *storekey = NULL;
6692 redisSortObject *vector; /* Resulting vector to sort */
6693
6694 /* Lookup the key to sort. It must be of the right types */
6695 sortval = lookupKeyRead(c->db,c->argv[1]);
6696 if (sortval == NULL) {
6697 addReply(c,shared.emptymultibulk);
6698 return;
6699 }
6700 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6701 sortval->type != REDIS_ZSET)
6702 {
6703 addReply(c,shared.wrongtypeerr);
6704 return;
6705 }
6706
6707 /* Create a list of operations to perform for every sorted element.
6708 * Operations can be GET/DEL/INCR/DECR */
6709 operations = listCreate();
6710 listSetFreeMethod(operations,zfree);
6711 j = 2;
6712
6713 /* Now we need to protect sortval incrementing its count, in the future
6714 * SORT may have options able to overwrite/delete keys during the sorting
6715 * and the sorted key itself may get destroied */
6716 incrRefCount(sortval);
6717
6718 /* The SORT command has an SQL-alike syntax, parse it */
6719 while(j < c->argc) {
6720 int leftargs = c->argc-j-1;
6721 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6722 desc = 0;
6723 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6724 desc = 1;
6725 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6726 alpha = 1;
6727 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6728 limit_start = atoi(c->argv[j+1]->ptr);
6729 limit_count = atoi(c->argv[j+2]->ptr);
6730 j+=2;
6731 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6732 storekey = c->argv[j+1];
6733 j++;
6734 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6735 sortby = c->argv[j+1];
6736 /* If the BY pattern does not contain '*', i.e. it is constant,
6737 * we don't need to sort nor to lookup the weight keys. */
6738 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6739 j++;
6740 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6741 listAddNodeTail(operations,createSortOperation(
6742 REDIS_SORT_GET,c->argv[j+1]));
6743 getop++;
6744 j++;
6745 } else {
6746 decrRefCount(sortval);
6747 listRelease(operations);
6748 addReply(c,shared.syntaxerr);
6749 return;
6750 }
6751 j++;
6752 }
6753
6754 /* Load the sorting vector with all the objects to sort */
6755 switch(sortval->type) {
6756 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6757 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6758 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6759 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6760 }
6761 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6762 j = 0;
6763
6764 if (sortval->type == REDIS_LIST) {
6765 list *list = sortval->ptr;
6766 listNode *ln;
6767 listIter li;
6768
6769 listRewind(list,&li);
6770 while((ln = listNext(&li))) {
6771 robj *ele = ln->value;
6772 vector[j].obj = ele;
6773 vector[j].u.score = 0;
6774 vector[j].u.cmpobj = NULL;
6775 j++;
6776 }
6777 } else {
6778 dict *set;
6779 dictIterator *di;
6780 dictEntry *setele;
6781
6782 if (sortval->type == REDIS_SET) {
6783 set = sortval->ptr;
6784 } else {
6785 zset *zs = sortval->ptr;
6786 set = zs->dict;
6787 }
6788
6789 di = dictGetIterator(set);
6790 while((setele = dictNext(di)) != NULL) {
6791 vector[j].obj = dictGetEntryKey(setele);
6792 vector[j].u.score = 0;
6793 vector[j].u.cmpobj = NULL;
6794 j++;
6795 }
6796 dictReleaseIterator(di);
6797 }
6798 redisAssert(j == vectorlen);
6799
6800 /* Now it's time to load the right scores in the sorting vector */
6801 if (dontsort == 0) {
6802 for (j = 0; j < vectorlen; j++) {
6803 robj *byval;
6804 if (sortby) {
6805 /* lookup value to sort by */
6806 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6807 if (!byval) continue;
6808 } else {
6809 /* use object itself to sort by */
6810 byval = vector[j].obj;
6811 }
6812
6813 if (alpha) {
6814 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6815 } else {
6816 if (byval->encoding == REDIS_ENCODING_RAW) {
6817 vector[j].u.score = strtod(byval->ptr,NULL);
6818 } else if (byval->encoding == REDIS_ENCODING_INT) {
6819 /* Don't need to decode the object if it's
6820 * integer-encoded (the only encoding supported) so
6821 * far. We can just cast it */
6822 vector[j].u.score = (long)byval->ptr;
6823 } else {
6824 redisAssert(1 != 1);
6825 }
6826 }
6827
6828 /* when the object was retrieved using lookupKeyByPattern,
6829 * its refcount needs to be decreased. */
6830 if (sortby) {
6831 decrRefCount(byval);
6832 }
6833 }
6834 }
6835
6836 /* We are ready to sort the vector... perform a bit of sanity check
6837 * on the LIMIT option too. We'll use a partial version of quicksort. */
6838 start = (limit_start < 0) ? 0 : limit_start;
6839 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6840 if (start >= vectorlen) {
6841 start = vectorlen-1;
6842 end = vectorlen-2;
6843 }
6844 if (end >= vectorlen) end = vectorlen-1;
6845
6846 if (dontsort == 0) {
6847 server.sort_desc = desc;
6848 server.sort_alpha = alpha;
6849 server.sort_bypattern = sortby ? 1 : 0;
6850 if (sortby && (start != 0 || end != vectorlen-1))
6851 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6852 else
6853 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6854 }
6855
6856 /* Send command output to the output buffer, performing the specified
6857 * GET/DEL/INCR/DECR operations if any. */
6858 outputlen = getop ? getop*(end-start+1) : end-start+1;
6859 if (storekey == NULL) {
6860 /* STORE option not specified, sent the sorting result to client */
6861 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6862 for (j = start; j <= end; j++) {
6863 listNode *ln;
6864 listIter li;
6865
6866 if (!getop) addReplyBulk(c,vector[j].obj);
6867 listRewind(operations,&li);
6868 while((ln = listNext(&li))) {
6869 redisSortOperation *sop = ln->value;
6870 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6871 vector[j].obj);
6872
6873 if (sop->type == REDIS_SORT_GET) {
6874 if (!val) {
6875 addReply(c,shared.nullbulk);
6876 } else {
6877 addReplyBulk(c,val);
6878 decrRefCount(val);
6879 }
6880 } else {
6881 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6882 }
6883 }
6884 }
6885 } else {
6886 robj *listObject = createListObject();
6887 list *listPtr = (list*) listObject->ptr;
6888
6889 /* STORE option specified, set the sorting result as a List object */
6890 for (j = start; j <= end; j++) {
6891 listNode *ln;
6892 listIter li;
6893
6894 if (!getop) {
6895 listAddNodeTail(listPtr,vector[j].obj);
6896 incrRefCount(vector[j].obj);
6897 }
6898 listRewind(operations,&li);
6899 while((ln = listNext(&li))) {
6900 redisSortOperation *sop = ln->value;
6901 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6902 vector[j].obj);
6903
6904 if (sop->type == REDIS_SORT_GET) {
6905 if (!val) {
6906 listAddNodeTail(listPtr,createStringObject("",0));
6907 } else {
6908 /* We should do a incrRefCount on val because it is
6909 * added to the list, but also a decrRefCount because
6910 * it is returned by lookupKeyByPattern. This results
6911 * in doing nothing at all. */
6912 listAddNodeTail(listPtr,val);
6913 }
6914 } else {
6915 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6916 }
6917 }
6918 }
6919 if (dictReplace(c->db->dict,storekey,listObject)) {
6920 incrRefCount(storekey);
6921 }
6922 /* Note: we add 1 because the DB is dirty anyway since even if the
6923 * SORT result is empty a new key is set and maybe the old content
6924 * replaced. */
6925 server.dirty += 1+outputlen;
6926 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6927 }
6928
6929 /* Cleanup */
6930 decrRefCount(sortval);
6931 listRelease(operations);
6932 for (j = 0; j < vectorlen; j++) {
6933 if (alpha && vector[j].u.cmpobj)
6934 decrRefCount(vector[j].u.cmpobj);
6935 }
6936 zfree(vector);
6937 }
6938
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' receives the NUL terminated result and must be large enough to hold
 * it (the caller in this file passes a 64 byte buffer). Every value of 'n'
 * produces output: amounts of one tebibyte and more use the T and P units,
 * and anything larger falls back to the plain byte count, so 's' is never
 * left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the units above: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6959
6960 /* Create the string returned by the INFO command. This is decoupled
6961 * by the INFO command itself as we need to report the same information
6962 * on memory corruption problems. */
6963 static sds genRedisInfoString(void) {
6964 sds info;
6965 time_t uptime = time(NULL)-server.stat_starttime;
6966 int j;
6967 char hmem[64];
6968
6969 bytesToHuman(hmem,zmalloc_used_memory());
6970 info = sdscatprintf(sdsempty(),
6971 "redis_version:%s\r\n"
6972 "arch_bits:%s\r\n"
6973 "multiplexing_api:%s\r\n"
6974 "process_id:%ld\r\n"
6975 "uptime_in_seconds:%ld\r\n"
6976 "uptime_in_days:%ld\r\n"
6977 "connected_clients:%d\r\n"
6978 "connected_slaves:%d\r\n"
6979 "blocked_clients:%d\r\n"
6980 "used_memory:%zu\r\n"
6981 "used_memory_human:%s\r\n"
6982 "changes_since_last_save:%lld\r\n"
6983 "bgsave_in_progress:%d\r\n"
6984 "last_save_time:%ld\r\n"
6985 "bgrewriteaof_in_progress:%d\r\n"
6986 "total_connections_received:%lld\r\n"
6987 "total_commands_processed:%lld\r\n"
6988 "expired_keys:%lld\r\n"
6989 "hash_max_zipmap_entries:%ld\r\n"
6990 "hash_max_zipmap_value:%ld\r\n"
6991 "pubsub_channels:%ld\r\n"
6992 "pubsub_patterns:%u\r\n"
6993 "vm_enabled:%d\r\n"
6994 "role:%s\r\n"
6995 ,REDIS_VERSION,
6996 (sizeof(long) == 8) ? "64" : "32",
6997 aeGetApiName(),
6998 (long) getpid(),
6999 uptime,
7000 uptime/(3600*24),
7001 listLength(server.clients)-listLength(server.slaves),
7002 listLength(server.slaves),
7003 server.blpop_blocked_clients,
7004 zmalloc_used_memory(),
7005 hmem,
7006 server.dirty,
7007 server.bgsavechildpid != -1,
7008 server.lastsave,
7009 server.bgrewritechildpid != -1,
7010 server.stat_numconnections,
7011 server.stat_numcommands,
7012 server.stat_expiredkeys,
7013 server.hash_max_zipmap_entries,
7014 server.hash_max_zipmap_value,
7015 dictSize(server.pubsub_channels),
7016 listLength(server.pubsub_patterns),
7017 server.vm_enabled != 0,
7018 server.masterhost == NULL ? "master" : "slave"
7019 );
7020 if (server.masterhost) {
7021 info = sdscatprintf(info,
7022 "master_host:%s\r\n"
7023 "master_port:%d\r\n"
7024 "master_link_status:%s\r\n"
7025 "master_last_io_seconds_ago:%d\r\n"
7026 ,server.masterhost,
7027 server.masterport,
7028 (server.replstate == REDIS_REPL_CONNECTED) ?
7029 "up" : "down",
7030 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
7031 );
7032 }
7033 if (server.vm_enabled) {
7034 lockThreadedIO();
7035 info = sdscatprintf(info,
7036 "vm_conf_max_memory:%llu\r\n"
7037 "vm_conf_page_size:%llu\r\n"
7038 "vm_conf_pages:%llu\r\n"
7039 "vm_stats_used_pages:%llu\r\n"
7040 "vm_stats_swapped_objects:%llu\r\n"
7041 "vm_stats_swappin_count:%llu\r\n"
7042 "vm_stats_swappout_count:%llu\r\n"
7043 "vm_stats_io_newjobs_len:%lu\r\n"
7044 "vm_stats_io_processing_len:%lu\r\n"
7045 "vm_stats_io_processed_len:%lu\r\n"
7046 "vm_stats_io_active_threads:%lu\r\n"
7047 "vm_stats_blocked_clients:%lu\r\n"
7048 ,(unsigned long long) server.vm_max_memory,
7049 (unsigned long long) server.vm_page_size,
7050 (unsigned long long) server.vm_pages,
7051 (unsigned long long) server.vm_stats_used_pages,
7052 (unsigned long long) server.vm_stats_swapped_objects,
7053 (unsigned long long) server.vm_stats_swapins,
7054 (unsigned long long) server.vm_stats_swapouts,
7055 (unsigned long) listLength(server.io_newjobs),
7056 (unsigned long) listLength(server.io_processing),
7057 (unsigned long) listLength(server.io_processed),
7058 (unsigned long) server.io_active_threads,
7059 (unsigned long) server.vm_blocked_clients
7060 );
7061 unlockThreadedIO();
7062 }
7063 for (j = 0; j < server.dbnum; j++) {
7064 long long keys, vkeys;
7065
7066 keys = dictSize(server.db[j].dict);
7067 vkeys = dictSize(server.db[j].expires);
7068 if (keys || vkeys) {
7069 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7070 j, keys, vkeys);
7071 }
7072 }
7073 return info;
7074 }
7075
7076 static void infoCommand(redisClient *c) {
7077 sds info = genRedisInfoString();
7078 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7079 (unsigned long)sdslen(info)));
7080 addReplySds(c,info);
7081 addReply(c,shared.crlf);
7082 }
7083
7084 static void monitorCommand(redisClient *c) {
7085 /* ignore MONITOR if aleady slave or in monitor mode */
7086 if (c->flags & REDIS_SLAVE) return;
7087
7088 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7089 c->slaveseldb = 0;
7090 listAddNodeTail(server.monitors,c);
7091 addReply(c,shared.ok);
7092 }
7093
7094 /* ================================= Expire ================================= */
7095 static int removeExpire(redisDb *db, robj *key) {
7096 if (dictDelete(db->expires,key) == DICT_OK) {
7097 return 1;
7098 } else {
7099 return 0;
7100 }
7101 }
7102
7103 static int setExpire(redisDb *db, robj *key, time_t when) {
7104 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7105 return 0;
7106 } else {
7107 incrRefCount(key);
7108 return 1;
7109 }
7110 }
7111
7112 /* Return the expire time of the specified key, or -1 if no expire
7113 * is associated with this key (i.e. the key is non volatile) */
7114 static time_t getExpire(redisDb *db, robj *key) {
7115 dictEntry *de;
7116
7117 /* No expire? return ASAP */
7118 if (dictSize(db->expires) == 0 ||
7119 (de = dictFind(db->expires,key)) == NULL) return -1;
7120
7121 return (time_t) dictGetEntryVal(de);
7122 }
7123
7124 static int expireIfNeeded(redisDb *db, robj *key) {
7125 time_t when;
7126 dictEntry *de;
7127
7128 /* No expire? return ASAP */
7129 if (dictSize(db->expires) == 0 ||
7130 (de = dictFind(db->expires,key)) == NULL) return 0;
7131
7132 /* Lookup the expire */
7133 when = (time_t) dictGetEntryVal(de);
7134 if (time(NULL) <= when) return 0;
7135
7136 /* Delete the key */
7137 dictDelete(db->expires,key);
7138 server.stat_expiredkeys++;
7139 return dictDelete(db->dict,key) == DICT_OK;
7140 }
7141
7142 static int deleteIfVolatile(redisDb *db, robj *key) {
7143 dictEntry *de;
7144
7145 /* No expire? return ASAP */
7146 if (dictSize(db->expires) == 0 ||
7147 (de = dictFind(db->expires,key)) == NULL) return 0;
7148
7149 /* Delete the key */
7150 server.dirty++;
7151 server.stat_expiredkeys++;
7152 dictDelete(db->expires,key);
7153 return dictDelete(db->dict,key) == DICT_OK;
7154 }
7155
7156 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7157 dictEntry *de;
7158 time_t seconds;
7159
7160 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7161
7162 seconds -= offset;
7163
7164 de = dictFind(c->db->dict,key);
7165 if (de == NULL) {
7166 addReply(c,shared.czero);
7167 return;
7168 }
7169 if (seconds <= 0) {
7170 if (deleteKey(c->db,key)) server.dirty++;
7171 addReply(c, shared.cone);
7172 return;
7173 } else {
7174 time_t when = time(NULL)+seconds;
7175 if (setExpire(c->db,key,when)) {
7176 addReply(c,shared.cone);
7177 server.dirty++;
7178 } else {
7179 addReply(c,shared.czero);
7180 }
7181 return;
7182 }
7183 }
7184
7185 static void expireCommand(redisClient *c) {
7186 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7187 }
7188
7189 static void expireatCommand(redisClient *c) {
7190 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7191 }
7192
7193 static void ttlCommand(redisClient *c) {
7194 time_t expire;
7195 int ttl = -1;
7196
7197 expire = getExpire(c->db,c->argv[1]);
7198 if (expire != -1) {
7199 ttl = (int) (expire-time(NULL));
7200 if (ttl < 0) ttl = -1;
7201 }
7202 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7203 }
7204
7205 /* ================================ MULTI/EXEC ============================== */
7206
7207 /* Client state initialization for MULTI/EXEC */
7208 static void initClientMultiState(redisClient *c) {
7209 c->mstate.commands = NULL;
7210 c->mstate.count = 0;
7211 }
7212
7213 /* Release all the resources associated with MULTI/EXEC state */
7214 static void freeClientMultiState(redisClient *c) {
7215 int j;
7216
7217 for (j = 0; j < c->mstate.count; j++) {
7218 int i;
7219 multiCmd *mc = c->mstate.commands+j;
7220
7221 for (i = 0; i < mc->argc; i++)
7222 decrRefCount(mc->argv[i]);
7223 zfree(mc->argv);
7224 }
7225 zfree(c->mstate.commands);
7226 }
7227
7228 /* Add a new command into the MULTI commands queue */
7229 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7230 multiCmd *mc;
7231 int j;
7232
7233 c->mstate.commands = zrealloc(c->mstate.commands,
7234 sizeof(multiCmd)*(c->mstate.count+1));
7235 mc = c->mstate.commands+c->mstate.count;
7236 mc->cmd = cmd;
7237 mc->argc = c->argc;
7238 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7239 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7240 for (j = 0; j < c->argc; j++)
7241 incrRefCount(mc->argv[j]);
7242 c->mstate.count++;
7243 }
7244
7245 static void multiCommand(redisClient *c) {
7246 c->flags |= REDIS_MULTI;
7247 addReply(c,shared.ok);
7248 }
7249
7250 static void discardCommand(redisClient *c) {
7251 if (!(c->flags & REDIS_MULTI)) {
7252 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7253 return;
7254 }
7255
7256 freeClientMultiState(c);
7257 initClientMultiState(c);
7258 c->flags &= (~REDIS_MULTI);
7259 addReply(c,shared.ok);
7260 }
7261
7262 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7263 * implememntation for more information. */
7264 static void execCommandReplicateMulti(redisClient *c) {
7265 struct redisCommand *cmd;
7266 robj *multistring = createStringObject("MULTI",5);
7267
7268 cmd = lookupCommand("multi");
7269 if (server.appendonly)
7270 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7271 if (listLength(server.slaves))
7272 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7273 decrRefCount(multistring);
7274 }
7275
7276 static void execCommand(redisClient *c) {
7277 int j;
7278 robj **orig_argv;
7279 int orig_argc;
7280
7281 if (!(c->flags & REDIS_MULTI)) {
7282 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7283 return;
7284 }
7285
7286 /* Replicate a MULTI request now that we are sure the block is executed.
7287 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7288 * both the AOF and the replication link will have the same consistency
7289 * and atomicity guarantees. */
7290 execCommandReplicateMulti(c);
7291
7292 /* Exec all the queued commands */
7293 orig_argv = c->argv;
7294 orig_argc = c->argc;
7295 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7296 for (j = 0; j < c->mstate.count; j++) {
7297 c->argc = c->mstate.commands[j].argc;
7298 c->argv = c->mstate.commands[j].argv;
7299 call(c,c->mstate.commands[j].cmd);
7300 }
7301 c->argv = orig_argv;
7302 c->argc = orig_argc;
7303 freeClientMultiState(c);
7304 initClientMultiState(c);
7305 c->flags &= (~REDIS_MULTI);
7306 /* Make sure the EXEC command is always replicated / AOF, since we
7307 * always send the MULTI command (we can't know beforehand if the
7308 * next operations will contain at least a modification to the DB). */
7309 server.dirty++;
7310 }
7311
7312 /* =========================== Blocking Operations ========================= */
7313
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7342
7343 /* Set a client in blocking mode for the specified key, with the specified
7344 * timeout */
7345 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7346 dictEntry *de;
7347 list *l;
7348 int j;
7349
7350 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7351 c->blockingkeysnum = numkeys;
7352 c->blockingto = timeout;
7353 for (j = 0; j < numkeys; j++) {
7354 /* Add the key in the client structure, to map clients -> keys */
7355 c->blockingkeys[j] = keys[j];
7356 incrRefCount(keys[j]);
7357
7358 /* And in the other "side", to map keys -> clients */
7359 de = dictFind(c->db->blockingkeys,keys[j]);
7360 if (de == NULL) {
7361 int retval;
7362
7363 /* For every key we take a list of clients blocked for it */
7364 l = listCreate();
7365 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7366 incrRefCount(keys[j]);
7367 assert(retval == DICT_OK);
7368 } else {
7369 l = dictGetEntryVal(de);
7370 }
7371 listAddNodeTail(l,c);
7372 }
7373 /* Mark the client as a blocked client */
7374 c->flags |= REDIS_BLOCKED;
7375 server.blpop_blocked_clients++;
7376 }
7377
7378 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7379 static void unblockClientWaitingData(redisClient *c) {
7380 dictEntry *de;
7381 list *l;
7382 int j;
7383
7384 assert(c->blockingkeys != NULL);
7385 /* The client may wait for multiple keys, so unblock it for every key. */
7386 for (j = 0; j < c->blockingkeysnum; j++) {
7387 /* Remove this client from the list of clients waiting for this key. */
7388 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7389 assert(de != NULL);
7390 l = dictGetEntryVal(de);
7391 listDelNode(l,listSearchKey(l,c));
7392 /* If the list is empty we need to remove it to avoid wasting memory */
7393 if (listLength(l) == 0)
7394 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7395 decrRefCount(c->blockingkeys[j]);
7396 }
7397 /* Cleanup the client structure */
7398 zfree(c->blockingkeys);
7399 c->blockingkeys = NULL;
7400 c->flags &= (~REDIS_BLOCKED);
7401 server.blpop_blocked_clients--;
7402 /* We want to process data if there is some command waiting
7403 * in the input buffer. Note that this is safe even if
7404 * unblockClientWaitingData() gets called from freeClient() because
7405 * freeClient() will be smart enough to call this function
7406 * *after* c->querybuf was set to NULL. */
7407 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7408 }
7409
7410 /* This should be called from any function PUSHing into lists.
7411 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7412 * 'ele' is the element pushed.
7413 *
7414 * If the function returns 0 there was no client waiting for a list push
7415 * against this key.
7416 *
7417 * If the function returns 1 there was a client waiting for a list push
7418 * against this key, the element was passed to this client thus it's not
7419 * needed to actually add it to the list and the caller should return asap. */
7420 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7421 struct dictEntry *de;
7422 redisClient *receiver;
7423 list *l;
7424 listNode *ln;
7425
7426 de = dictFind(c->db->blockingkeys,key);
7427 if (de == NULL) return 0;
7428 l = dictGetEntryVal(de);
7429 ln = listFirst(l);
7430 assert(ln != NULL);
7431 receiver = ln->value;
7432
7433 addReplySds(receiver,sdsnew("*2\r\n"));
7434 addReplyBulk(receiver,key);
7435 addReplyBulk(receiver,ele);
7436 unblockClientWaitingData(receiver);
7437 return 1;
7438 }
7439
7440 /* Blocking RPOP/LPOP */
7441 static void blockingPopGenericCommand(redisClient *c, int where) {
7442 robj *o;
7443 time_t timeout;
7444 int j;
7445
7446 for (j = 1; j < c->argc-1; j++) {
7447 o = lookupKeyWrite(c->db,c->argv[j]);
7448 if (o != NULL) {
7449 if (o->type != REDIS_LIST) {
7450 addReply(c,shared.wrongtypeerr);
7451 return;
7452 } else {
7453 list *list = o->ptr;
7454 if (listLength(list) != 0) {
7455 /* If the list contains elements fall back to the usual
7456 * non-blocking POP operation */
7457 robj *argv[2], **orig_argv;
7458 int orig_argc;
7459
7460 /* We need to alter the command arguments before to call
7461 * popGenericCommand() as the command takes a single key. */
7462 orig_argv = c->argv;
7463 orig_argc = c->argc;
7464 argv[1] = c->argv[j];
7465 c->argv = argv;
7466 c->argc = 2;
7467
7468 /* Also the return value is different, we need to output
7469 * the multi bulk reply header and the key name. The
7470 * "real" command will add the last element (the value)
7471 * for us. If this souds like an hack to you it's just
7472 * because it is... */
7473 addReplySds(c,sdsnew("*2\r\n"));
7474 addReplyBulk(c,argv[1]);
7475 popGenericCommand(c,where);
7476
7477 /* Fix the client structure with the original stuff */
7478 c->argv = orig_argv;
7479 c->argc = orig_argc;
7480 return;
7481 }
7482 }
7483 }
7484 }
7485 /* If the list is empty or the key does not exists we must block */
7486 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7487 if (timeout > 0) timeout += time(NULL);
7488 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7489 }
7490
7491 static void blpopCommand(redisClient *c) {
7492 blockingPopGenericCommand(c,REDIS_HEAD);
7493 }
7494
7495 static void brpopCommand(redisClient *c) {
7496 blockingPopGenericCommand(c,REDIS_TAIL);
7497 }
7498
7499 /* =============================== Replication ============================= */
7500
7501 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7502 ssize_t nwritten, ret = size;
7503 time_t start = time(NULL);
7504
7505 timeout++;
7506 while(size) {
7507 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7508 nwritten = write(fd,ptr,size);
7509 if (nwritten == -1) return -1;
7510 ptr += nwritten;
7511 size -= nwritten;
7512 }
7513 if ((time(NULL)-start) > timeout) {
7514 errno = ETIMEDOUT;
7515 return -1;
7516 }
7517 }
7518 return ret;
7519 }
7520
7521 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7522 ssize_t nread, totread = 0;
7523 time_t start = time(NULL);
7524
7525 timeout++;
7526 while(size) {
7527 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7528 nread = read(fd,ptr,size);
7529 if (nread == -1) return -1;
7530 ptr += nread;
7531 size -= nread;
7532 totread += nread;
7533 }
7534 if ((time(NULL)-start) > timeout) {
7535 errno = ETIMEDOUT;
7536 return -1;
7537 }
7538 }
7539 return totread;
7540 }
7541
/* Read a line from 'fd' one byte at a time, storing it NUL terminated into
 * the 'size' bytes buffer 'ptr'. The newline is not stored, and a carriage
 * return right before it is stripped. 'timeout' applies to every single
 * byte read (it is passed to syncRead() as is).
 *
 * Returns the number of characters stored before the newline was found
 * (a stripped carriage return is included in the count), or -1 if
 * syncRead() fails. At most 'size'-1 characters are stored: if the buffer
 * fills up before a newline is seen the function returns what it has, and
 * the rest of the line is left unread. A 'size' of zero or less returns 0
 * without touching the buffer. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';

    /* Reserve room for the NUL terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the stored byte, so that a line longer than the
             * buffer can never write past its end. */
            size--;
        }
    }
    return nread;
}
7562
7563 static void syncCommand(redisClient *c) {
7564 /* ignore SYNC if aleady slave or in monitor mode */
7565 if (c->flags & REDIS_SLAVE) return;
7566
7567 /* SYNC can't be issued when the server has pending data to send to
7568 * the client about already issued commands. We need a fresh reply
7569 * buffer registering the differences between the BGSAVE and the current
7570 * dataset, so that we can copy to other slaves if needed. */
7571 if (listLength(c->reply) != 0) {
7572 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7573 return;
7574 }
7575
7576 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7577 /* Here we need to check if there is a background saving operation
7578 * in progress, or if it is required to start one */
7579 if (server.bgsavechildpid != -1) {
7580 /* Ok a background save is in progress. Let's check if it is a good
7581 * one for replication, i.e. if there is another slave that is
7582 * registering differences since the server forked to save */
7583 redisClient *slave;
7584 listNode *ln;
7585 listIter li;
7586
7587 listRewind(server.slaves,&li);
7588 while((ln = listNext(&li))) {
7589 slave = ln->value;
7590 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7591 }
7592 if (ln) {
7593 /* Perfect, the server is already registering differences for
7594 * another slave. Set the right state, and copy the buffer. */
7595 listRelease(c->reply);
7596 c->reply = listDup(slave->reply);
7597 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7598 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7599 } else {
7600 /* No way, we need to wait for the next BGSAVE in order to
7601 * register differences */
7602 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7603 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7604 }
7605 } else {
7606 /* Ok we don't have a BGSAVE in progress, let's start one */
7607 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7608 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7609 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7610 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7611 return;
7612 }
7613 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7614 }
7615 c->repldbfd = -1;
7616 c->flags |= REDIS_SLAVE;
7617 c->slaveseldb = 0;
7618 listAddNodeTail(server.slaves,c);
7619 return;
7620 }
7621
7622 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7623 redisClient *slave = privdata;
7624 REDIS_NOTUSED(el);
7625 REDIS_NOTUSED(mask);
7626 char buf[REDIS_IOBUF_LEN];
7627 ssize_t nwritten, buflen;
7628
7629 if (slave->repldboff == 0) {
7630 /* Write the bulk write count before to transfer the DB. In theory here
7631 * we don't know how much room there is in the output buffer of the
7632 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7633 * operations) will never be smaller than the few bytes we need. */
7634 sds bulkcount;
7635
7636 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7637 slave->repldbsize);
7638 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7639 {
7640 sdsfree(bulkcount);
7641 freeClient(slave);
7642 return;
7643 }
7644 sdsfree(bulkcount);
7645 }
7646 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7647 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7648 if (buflen <= 0) {
7649 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7650 (buflen == 0) ? "premature EOF" : strerror(errno));
7651 freeClient(slave);
7652 return;
7653 }
7654 if ((nwritten = write(fd,buf,buflen)) == -1) {
7655 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7656 strerror(errno));
7657 freeClient(slave);
7658 return;
7659 }
7660 slave->repldboff += nwritten;
7661 if (slave->repldboff == slave->repldbsize) {
7662 close(slave->repldbfd);
7663 slave->repldbfd = -1;
7664 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7665 slave->replstate = REDIS_REPL_ONLINE;
7666 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7667 sendReplyToClient, slave) == AE_ERR) {
7668 freeClient(slave);
7669 return;
7670 }
7671 addReplySds(slave,sdsempty());
7672 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7673 }
7674 }
7675
7676 /* This function is called at the end of every backgrond saving.
7677 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7678 * otherwise REDIS_ERR is passed to the function.
7679 *
7680 * The goal of this function is to handle slaves waiting for a successful
7681 * background saving in order to perform non-blocking synchronization. */
7682 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7683 listNode *ln;
7684 int startbgsave = 0;
7685 listIter li;
7686
7687 listRewind(server.slaves,&li);
7688 while((ln = listNext(&li))) {
7689 redisClient *slave = ln->value;
7690
7691 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7692 startbgsave = 1;
7693 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7694 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7695 struct redis_stat buf;
7696
7697 if (bgsaveerr != REDIS_OK) {
7698 freeClient(slave);
7699 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7700 continue;
7701 }
7702 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7703 redis_fstat(slave->repldbfd,&buf) == -1) {
7704 freeClient(slave);
7705 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7706 continue;
7707 }
7708 slave->repldboff = 0;
7709 slave->repldbsize = buf.st_size;
7710 slave->replstate = REDIS_REPL_SEND_BULK;
7711 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7712 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7713 freeClient(slave);
7714 continue;
7715 }
7716 }
7717 }
7718 if (startbgsave) {
7719 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7720 listIter li;
7721
7722 listRewind(server.slaves,&li);
7723 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7724 while((ln = listNext(&li))) {
7725 redisClient *slave = ln->value;
7726
7727 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7728 freeClient(slave);
7729 }
7730 }
7731 }
7732 }
7733
7734 static int syncWithMaster(void) {
7735 char buf[1024], tmpfile[256], authcmd[1024];
7736 long dumpsize;
7737 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7738 int dfd, maxtries = 5;
7739
7740 if (fd == -1) {
7741 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7742 strerror(errno));
7743 return REDIS_ERR;
7744 }
7745
7746 /* AUTH with the master if required. */
7747 if(server.masterauth) {
7748 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7749 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7750 close(fd);
7751 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7752 strerror(errno));
7753 return REDIS_ERR;
7754 }
7755 /* Read the AUTH result. */
7756 if (syncReadLine(fd,buf,1024,3600) == -1) {
7757 close(fd);
7758 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7759 strerror(errno));
7760 return REDIS_ERR;
7761 }
7762 if (buf[0] != '+') {
7763 close(fd);
7764 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7765 return REDIS_ERR;
7766 }
7767 }
7768
7769 /* Issue the SYNC command */
7770 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7771 close(fd);
7772 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7773 strerror(errno));
7774 return REDIS_ERR;
7775 }
7776 /* Read the bulk write count */
7777 if (syncReadLine(fd,buf,1024,3600) == -1) {
7778 close(fd);
7779 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7780 strerror(errno));
7781 return REDIS_ERR;
7782 }
7783 if (buf[0] != '$') {
7784 close(fd);
7785 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7786 return REDIS_ERR;
7787 }
7788 dumpsize = strtol(buf+1,NULL,10);
7789 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7790 /* Read the bulk write data on a temp file */
7791 while(maxtries--) {
7792 snprintf(tmpfile,256,
7793 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7794 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7795 if (dfd != -1) break;
7796 sleep(1);
7797 }
7798 if (dfd == -1) {
7799 close(fd);
7800 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7801 return REDIS_ERR;
7802 }
7803 while(dumpsize) {
7804 int nread, nwritten;
7805
7806 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7807 if (nread == -1) {
7808 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7809 strerror(errno));
7810 close(fd);
7811 close(dfd);
7812 return REDIS_ERR;
7813 }
7814 nwritten = write(dfd,buf,nread);
7815 if (nwritten == -1) {
7816 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7817 close(fd);
7818 close(dfd);
7819 return REDIS_ERR;
7820 }
7821 dumpsize -= nread;
7822 }
7823 close(dfd);
7824 if (rename(tmpfile,server.dbfilename) == -1) {
7825 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7826 unlink(tmpfile);
7827 close(fd);
7828 return REDIS_ERR;
7829 }
7830 emptyDb();
7831 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7832 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7833 close(fd);
7834 return REDIS_ERR;
7835 }
7836 server.master = createClient(fd);
7837 server.master->flags |= REDIS_MASTER;
7838 server.master->authenticated = 1;
7839 server.replstate = REDIS_REPL_CONNECTED;
7840 return REDIS_OK;
7841 }
7842
7843 static void slaveofCommand(redisClient *c) {
7844 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7845 !strcasecmp(c->argv[2]->ptr,"one")) {
7846 if (server.masterhost) {
7847 sdsfree(server.masterhost);
7848 server.masterhost = NULL;
7849 if (server.master) freeClient(server.master);
7850 server.replstate = REDIS_REPL_NONE;
7851 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7852 }
7853 } else {
7854 sdsfree(server.masterhost);
7855 server.masterhost = sdsdup(c->argv[1]->ptr);
7856 server.masterport = atoi(c->argv[2]->ptr);
7857 if (server.master) freeClient(server.master);
7858 server.replstate = REDIS_REPL_CONNECT;
7859 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7860 server.masterhost, server.masterport);
7861 }
7862 addReply(c,shared.ok);
7863 }
7864
7865 /* ============================ Maxmemory directive ======================== */
7866
7867 /* Try to free one object form the pre-allocated objects free list.
7868 * This is useful under low mem conditions as by default we take 1 million
7869 * free objects allocated. On success REDIS_OK is returned, otherwise
7870 * REDIS_ERR. */
7871 static int tryFreeOneObjectFromFreelist(void) {
7872 robj *o;
7873
7874 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7875 if (listLength(server.objfreelist)) {
7876 listNode *head = listFirst(server.objfreelist);
7877 o = listNodeValue(head);
7878 listDelNode(server.objfreelist,head);
7879 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7880 zfree(o);
7881 return REDIS_OK;
7882 } else {
7883 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7884 return REDIS_ERR;
7885 }
7886 }
7887
7888 /* This function gets called when 'maxmemory' is set on the config file to limit
7889 * the max memory used by the server, and we are out of memory.
7890 * This function will try to, in order:
7891 *
7892 * - Free objects from the free list
7893 * - Try to remove keys with an EXPIRE set
7894 *
7895 * It is not possible to free enough memory to reach used-memory < maxmemory
7896 * the server will start refusing commands that will enlarge even more the
7897 * memory usage.
7898 */
7899 static void freeMemoryIfNeeded(void) {
7900 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7901 int j, k, freed = 0;
7902
7903 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7904 for (j = 0; j < server.dbnum; j++) {
7905 int minttl = -1;
7906 robj *minkey = NULL;
7907 struct dictEntry *de;
7908
7909 if (dictSize(server.db[j].expires)) {
7910 freed = 1;
7911 /* From a sample of three keys drop the one nearest to
7912 * the natural expire */
7913 for (k = 0; k < 3; k++) {
7914 time_t t;
7915
7916 de = dictGetRandomKey(server.db[j].expires);
7917 t = (time_t) dictGetEntryVal(de);
7918 if (minttl == -1 || t < minttl) {
7919 minkey = dictGetEntryKey(de);
7920 minttl = t;
7921 }
7922 }
7923 deleteKey(server.db+j,minkey);
7924 }
7925 }
7926 if (!freed) return; /* nothing to free... */
7927 }
7928 }
7929
7930 /* ============================== Append Only file ========================== */
7931
7932 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7933 sds buf = sdsempty();
7934 int j;
7935 ssize_t nwritten;
7936 time_t now;
7937 robj *tmpargv[3];
7938
7939 /* The DB this command was targetting is not the same as the last command
7940 * we appendend. To issue a SELECT command is needed. */
7941 if (dictid != server.appendseldb) {
7942 char seldb[64];
7943
7944 snprintf(seldb,sizeof(seldb),"%d",dictid);
7945 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7946 (unsigned long)strlen(seldb),seldb);
7947 server.appendseldb = dictid;
7948 }
7949
7950 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7951 * EXPIREs into EXPIREATs calls */
7952 if (cmd->proc == expireCommand) {
7953 long when;
7954
7955 tmpargv[0] = createStringObject("EXPIREAT",8);
7956 tmpargv[1] = argv[1];
7957 incrRefCount(argv[1]);
7958 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7959 tmpargv[2] = createObject(REDIS_STRING,
7960 sdscatprintf(sdsempty(),"%ld",when));
7961 argv = tmpargv;
7962 }
7963
7964 /* Append the actual command */
7965 buf = sdscatprintf(buf,"*%d\r\n",argc);
7966 for (j = 0; j < argc; j++) {
7967 robj *o = argv[j];
7968
7969 o = getDecodedObject(o);
7970 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7971 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7972 buf = sdscatlen(buf,"\r\n",2);
7973 decrRefCount(o);
7974 }
7975
7976 /* Free the objects from the modified argv for EXPIREAT */
7977 if (cmd->proc == expireCommand) {
7978 for (j = 0; j < 3; j++)
7979 decrRefCount(argv[j]);
7980 }
7981
7982 /* We want to perform a single write. This should be guaranteed atomic
7983 * at least if the filesystem we are writing is a real physical one.
7984 * While this will save us against the server being killed I don't think
7985 * there is much to do about the whole server stopping for power problems
7986 * or alike */
7987 nwritten = write(server.appendfd,buf,sdslen(buf));
7988 if (nwritten != (signed)sdslen(buf)) {
7989 /* Ooops, we are in troubles. The best thing to do for now is
7990 * to simply exit instead to give the illusion that everything is
7991 * working as expected. */
7992 if (nwritten == -1) {
7993 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7994 } else {
7995 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7996 }
7997 exit(1);
7998 }
7999 /* If a background append only file rewriting is in progress we want to
8000 * accumulate the differences between the child DB and the current one
8001 * in a buffer, so that when the child process will do its work we
8002 * can append the differences to the new append only file. */
8003 if (server.bgrewritechildpid != -1)
8004 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8005
8006 sdsfree(buf);
8007 now = time(NULL);
8008 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8009 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8010 now-server.lastfsync > 1))
8011 {
8012 fsync(server.appendfd); /* Let's try to get this data on the disk */
8013 server.lastfsync = now;
8014 }
8015 }
8016
8017 /* In Redis commands are always executed in the context of a client, so in
8018 * order to load the append only file we need to create a fake client. */
8019 static struct redisClient *createFakeClient(void) {
8020 struct redisClient *c = zmalloc(sizeof(*c));
8021
8022 selectDb(c,0);
8023 c->fd = -1;
8024 c->querybuf = sdsempty();
8025 c->argc = 0;
8026 c->argv = NULL;
8027 c->flags = 0;
8028 /* We set the fake client as a slave waiting for the synchronization
8029 * so that Redis will not try to send replies to this client. */
8030 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8031 c->reply = listCreate();
8032 listSetFreeMethod(c->reply,decrRefCount);
8033 listSetDupMethod(c->reply,dupClientReplyValue);
8034 return c;
8035 }
8036
8037 static void freeFakeClient(struct redisClient *c) {
8038 sdsfree(c->querybuf);
8039 listRelease(c->reply);
8040 zfree(c);
8041 }
8042
8043 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8044 * error (the append only file is zero-length) REDIS_ERR is returned. On
8045 * fatal error an error message is logged and the program exists. */
8046 int loadAppendOnlyFile(char *filename) {
8047 struct redisClient *fakeClient;
8048 FILE *fp = fopen(filename,"r");
8049 struct redis_stat sb;
8050 unsigned long long loadedkeys = 0;
8051
8052 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8053 return REDIS_ERR;
8054
8055 if (fp == NULL) {
8056 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8057 exit(1);
8058 }
8059
8060 fakeClient = createFakeClient();
8061 while(1) {
8062 int argc, j;
8063 unsigned long len;
8064 robj **argv;
8065 char buf[128];
8066 sds argsds;
8067 struct redisCommand *cmd;
8068
8069 if (fgets(buf,sizeof(buf),fp) == NULL) {
8070 if (feof(fp))
8071 break;
8072 else
8073 goto readerr;
8074 }
8075 if (buf[0] != '*') goto fmterr;
8076 argc = atoi(buf+1);
8077 argv = zmalloc(sizeof(robj*)*argc);
8078 for (j = 0; j < argc; j++) {
8079 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8080 if (buf[0] != '$') goto fmterr;
8081 len = strtol(buf+1,NULL,10);
8082 argsds = sdsnewlen(NULL,len);
8083 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8084 argv[j] = createObject(REDIS_STRING,argsds);
8085 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8086 }
8087
8088 /* Command lookup */
8089 cmd = lookupCommand(argv[0]->ptr);
8090 if (!cmd) {
8091 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8092 exit(1);
8093 }
8094 /* Try object encoding */
8095 if (cmd->flags & REDIS_CMD_BULK)
8096 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8097 /* Run the command in the context of a fake client */
8098 fakeClient->argc = argc;
8099 fakeClient->argv = argv;
8100 cmd->proc(fakeClient);
8101 /* Discard the reply objects list from the fake client */
8102 while(listLength(fakeClient->reply))
8103 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8104 /* Clean up, ready for the next command */
8105 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8106 zfree(argv);
8107 /* Handle swapping while loading big datasets when VM is on */
8108 loadedkeys++;
8109 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8110 while (zmalloc_used_memory() > server.vm_max_memory) {
8111 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8112 }
8113 }
8114 }
8115 fclose(fp);
8116 freeFakeClient(fakeClient);
8117 return REDIS_OK;
8118
8119 readerr:
8120 if (feof(fp)) {
8121 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8122 } else {
8123 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8124 }
8125 exit(1);
8126 fmterr:
8127 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8128 exit(1);
8129 }
8130
8131 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8132 static int fwriteBulkObject(FILE *fp, robj *obj) {
8133 char buf[128];
8134 int decrrc = 0;
8135
8136 /* Avoid the incr/decr ref count business if possible to help
8137 * copy-on-write (we are often in a child process when this function
8138 * is called).
8139 * Also makes sure that key objects don't get incrRefCount-ed when VM
8140 * is enabled */
8141 if (obj->encoding != REDIS_ENCODING_RAW) {
8142 obj = getDecodedObject(obj);
8143 decrrc = 1;
8144 }
8145 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8146 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8147 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8148 goto err;
8149 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8150 if (decrrc) decrRefCount(obj);
8151 return 1;
8152 err:
8153 if (decrrc) decrRefCount(obj);
8154 return 0;
8155 }
8156
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it may contain any byte, including
 * zero bytes). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0, so skip it for empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8168
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 * The payload is the "%.17g" representation of 'd'.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
        return 0;
    return 1;
}
8179
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of 'l'.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
        return 0;
    return 1;
}
8190
8191 /* Write a sequence of commands able to fully rebuild the dataset into
8192 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8193 static int rewriteAppendOnlyFile(char *filename) {
8194 dictIterator *di = NULL;
8195 dictEntry *de;
8196 FILE *fp;
8197 char tmpfile[256];
8198 int j;
8199 time_t now = time(NULL);
8200
8201 /* Note that we have to use a different temp name here compared to the
8202 * one used by rewriteAppendOnlyFileBackground() function. */
8203 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8204 fp = fopen(tmpfile,"w");
8205 if (!fp) {
8206 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8207 return REDIS_ERR;
8208 }
8209 for (j = 0; j < server.dbnum; j++) {
8210 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8211 redisDb *db = server.db+j;
8212 dict *d = db->dict;
8213 if (dictSize(d) == 0) continue;
8214 di = dictGetIterator(d);
8215 if (!di) {
8216 fclose(fp);
8217 return REDIS_ERR;
8218 }
8219
8220 /* SELECT the new DB */
8221 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8222 if (fwriteBulkLong(fp,j) == 0) goto werr;
8223
8224 /* Iterate this DB writing every entry */
8225 while((de = dictNext(di)) != NULL) {
8226 robj *key, *o;
8227 time_t expiretime;
8228 int swapped;
8229
8230 key = dictGetEntryKey(de);
8231 /* If the value for this key is swapped, load a preview in memory.
8232 * We use a "swapped" flag to remember if we need to free the
8233 * value object instead to just increment the ref count anyway
8234 * in order to avoid copy-on-write of pages if we are forked() */
8235 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8236 key->storage == REDIS_VM_SWAPPING) {
8237 o = dictGetEntryVal(de);
8238 swapped = 0;
8239 } else {
8240 o = vmPreviewObject(key);
8241 swapped = 1;
8242 }
8243 expiretime = getExpire(db,key);
8244
8245 /* Save the key and associated value */
8246 if (o->type == REDIS_STRING) {
8247 /* Emit a SET command */
8248 char cmd[]="*3\r\n$3\r\nSET\r\n";
8249 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8250 /* Key and value */
8251 if (fwriteBulkObject(fp,key) == 0) goto werr;
8252 if (fwriteBulkObject(fp,o) == 0) goto werr;
8253 } else if (o->type == REDIS_LIST) {
8254 /* Emit the RPUSHes needed to rebuild the list */
8255 list *list = o->ptr;
8256 listNode *ln;
8257 listIter li;
8258
8259 listRewind(list,&li);
8260 while((ln = listNext(&li))) {
8261 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8262 robj *eleobj = listNodeValue(ln);
8263
8264 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8265 if (fwriteBulkObject(fp,key) == 0) goto werr;
8266 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8267 }
8268 } else if (o->type == REDIS_SET) {
8269 /* Emit the SADDs needed to rebuild the set */
8270 dict *set = o->ptr;
8271 dictIterator *di = dictGetIterator(set);
8272 dictEntry *de;
8273
8274 while((de = dictNext(di)) != NULL) {
8275 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8276 robj *eleobj = dictGetEntryKey(de);
8277
8278 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8279 if (fwriteBulkObject(fp,key) == 0) goto werr;
8280 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8281 }
8282 dictReleaseIterator(di);
8283 } else if (o->type == REDIS_ZSET) {
8284 /* Emit the ZADDs needed to rebuild the sorted set */
8285 zset *zs = o->ptr;
8286 dictIterator *di = dictGetIterator(zs->dict);
8287 dictEntry *de;
8288
8289 while((de = dictNext(di)) != NULL) {
8290 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8291 robj *eleobj = dictGetEntryKey(de);
8292 double *score = dictGetEntryVal(de);
8293
8294 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8295 if (fwriteBulkObject(fp,key) == 0) goto werr;
8296 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8297 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8298 }
8299 dictReleaseIterator(di);
8300 } else if (o->type == REDIS_HASH) {
8301 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8302
8303 /* Emit the HSETs needed to rebuild the hash */
8304 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8305 unsigned char *p = zipmapRewind(o->ptr);
8306 unsigned char *field, *val;
8307 unsigned int flen, vlen;
8308
8309 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8310 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8311 if (fwriteBulkObject(fp,key) == 0) goto werr;
8312 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8313 return -1;
8314 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8315 return -1;
8316 }
8317 } else {
8318 dictIterator *di = dictGetIterator(o->ptr);
8319 dictEntry *de;
8320
8321 while((de = dictNext(di)) != NULL) {
8322 robj *field = dictGetEntryKey(de);
8323 robj *val = dictGetEntryVal(de);
8324
8325 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8326 if (fwriteBulkObject(fp,key) == 0) goto werr;
8327 if (fwriteBulkObject(fp,field) == -1) return -1;
8328 if (fwriteBulkObject(fp,val) == -1) return -1;
8329 }
8330 dictReleaseIterator(di);
8331 }
8332 } else {
8333 redisPanic("Unknown object type");
8334 }
8335 /* Save the expire time */
8336 if (expiretime != -1) {
8337 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8338 /* If this key is already expired skip it */
8339 if (expiretime < now) continue;
8340 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8341 if (fwriteBulkObject(fp,key) == 0) goto werr;
8342 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8343 }
8344 if (swapped) decrRefCount(o);
8345 }
8346 dictReleaseIterator(di);
8347 }
8348
8349 /* Make sure data will not remain on the OS's output buffers */
8350 fflush(fp);
8351 fsync(fileno(fp));
8352 fclose(fp);
8353
8354 /* Use RENAME to make sure the DB file is changed atomically only
8355 * if the generate DB file is ok. */
8356 if (rename(tmpfile,filename) == -1) {
8357 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8358 unlink(tmpfile);
8359 return REDIS_ERR;
8360 }
8361 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8362 return REDIS_OK;
8363
8364 werr:
8365 fclose(fp);
8366 unlink(tmpfile);
8367 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8368 if (di) dictReleaseIterator(di);
8369 return REDIS_ERR;
8370 }
8371
8372 /* This is how rewriting of the append only file in background works:
8373 *
8374 * 1) The user calls BGREWRITEAOF
8375 * 2) Redis calls this function, that forks():
8376 * 2a) the child rewrite the append only file in a temp file.
8377 * 2b) the parent accumulates differences in server.bgrewritebuf.
8378 * 3) When the child finished '2a' exists.
8379 * 4) The parent will trap the exit code, if it's OK, will append the
8380 * data accumulated into server.bgrewritebuf into the temp file, and
8381 * finally will rename(2) the temp file in the actual file name.
8382 * The the new file is reopened as the new append only file. Profit!
8383 */
8384 static int rewriteAppendOnlyFileBackground(void) {
8385 pid_t childpid;
8386
8387 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8388 if (server.vm_enabled) waitEmptyIOJobsQueue();
8389 if ((childpid = fork()) == 0) {
8390 /* Child */
8391 char tmpfile[256];
8392
8393 if (server.vm_enabled) vmReopenSwapFile();
8394 close(server.fd);
8395 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8396 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8397 _exit(0);
8398 } else {
8399 _exit(1);
8400 }
8401 } else {
8402 /* Parent */
8403 if (childpid == -1) {
8404 redisLog(REDIS_WARNING,
8405 "Can't rewrite append only file in background: fork: %s",
8406 strerror(errno));
8407 return REDIS_ERR;
8408 }
8409 redisLog(REDIS_NOTICE,
8410 "Background append only file rewriting started by pid %d",childpid);
8411 server.bgrewritechildpid = childpid;
8412 updateDictResizePolicy();
8413 /* We set appendseldb to -1 in order to force the next call to the
8414 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8415 * accumulated by the parent into server.bgrewritebuf will start
8416 * with a SELECT statement and it will be safe to merge. */
8417 server.appendseldb = -1;
8418 return REDIS_OK;
8419 }
8420 return REDIS_OK; /* unreached */
8421 }
8422
8423 static void bgrewriteaofCommand(redisClient *c) {
8424 if (server.bgrewritechildpid != -1) {
8425 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8426 return;
8427 }
8428 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8429 char *status = "+Background append only file rewriting started\r\n";
8430 addReplySds(c,sdsnew(status));
8431 } else {
8432 addReply(c,shared.err);
8433 }
8434 }
8435
/* Remove the temp file used by the background append only file rewriting
 * child with pid 'childpid'. Errors from unlink(2) are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8442
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
 */
8463
8464 /* =================== Virtual Memory - Blocking Side ====================== */
8465
8466 /* substitute the first occurrence of '%p' with the process pid in the
8467 * swap file name. */
8468 static void expandVmSwapFilename(void) {
8469 char *p = strstr(server.vm_swap_file,"%p");
8470 sds new;
8471
8472 if (!p) return;
8473 new = sdsempty();
8474 *p = '\0';
8475 new = sdscat(new,server.vm_swap_file);
8476 new = sdscatprintf(new,"%ld",(long) getpid());
8477 new = sdscat(new,p+2);
8478 zfree(server.vm_swap_file);
8479 server.vm_swap_file = new;
8480 }
8481
8482 static void vmInit(void) {
8483 off_t totsize;
8484 int pipefds[2];
8485 size_t stacksize;
8486
8487 if (server.vm_max_threads != 0)
8488 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8489
8490 expandVmSwapFilename();
8491 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8492 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8493 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8494 }
8495 if (server.vm_fp == NULL) {
8496 redisLog(REDIS_WARNING,
8497 "Impossible to open the swap file: %s. Exiting.",
8498 strerror(errno));
8499 exit(1);
8500 }
8501 server.vm_fd = fileno(server.vm_fp);
8502 server.vm_next_page = 0;
8503 server.vm_near_pages = 0;
8504 server.vm_stats_used_pages = 0;
8505 server.vm_stats_swapped_objects = 0;
8506 server.vm_stats_swapouts = 0;
8507 server.vm_stats_swapins = 0;
8508 totsize = server.vm_pages*server.vm_page_size;
8509 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8510 if (ftruncate(server.vm_fd,totsize) == -1) {
8511 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8512 strerror(errno));
8513 exit(1);
8514 } else {
8515 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8516 }
8517 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8518 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8519 (long long) (server.vm_pages+7)/8, server.vm_pages);
8520 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8521
8522 /* Initialize threaded I/O (used by Virtual Memory) */
8523 server.io_newjobs = listCreate();
8524 server.io_processing = listCreate();
8525 server.io_processed = listCreate();
8526 server.io_ready_clients = listCreate();
8527 pthread_mutex_init(&server.io_mutex,NULL);
8528 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8529 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8530 server.io_active_threads = 0;
8531 if (pipe(pipefds) == -1) {
8532 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8533 ,strerror(errno));
8534 exit(1);
8535 }
8536 server.io_ready_pipe_read = pipefds[0];
8537 server.io_ready_pipe_write = pipefds[1];
8538 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8539 /* LZF requires a lot of stack */
8540 pthread_attr_init(&server.io_threads_attr);
8541 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8542 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8543 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8544 /* Listen for events in the threaded I/O pipe */
8545 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8546 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8547 oom("creating file event");
8548 }
8549
8550 /* Mark the page as used */
8551 static void vmMarkPageUsed(off_t page) {
8552 off_t byte = page/8;
8553 int bit = page&7;
8554 redisAssert(vmFreePage(page) == 1);
8555 server.vm_bitmap[byte] |= 1<<bit;
8556 }
8557
8558 /* Mark N contiguous pages as used, with 'page' being the first. */
8559 static void vmMarkPagesUsed(off_t page, off_t count) {
8560 off_t j;
8561
8562 for (j = 0; j < count; j++)
8563 vmMarkPageUsed(page+j);
8564 server.vm_stats_used_pages += count;
8565 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8566 (long long)count, (long long)page);
8567 }
8568
8569 /* Mark the page as free */
8570 static void vmMarkPageFree(off_t page) {
8571 off_t byte = page/8;
8572 int bit = page&7;
8573 redisAssert(vmFreePage(page) == 0);
8574 server.vm_bitmap[byte] &= ~(1<<bit);
8575 }
8576
8577 /* Mark N contiguous pages as free, with 'page' being the first. */
8578 static void vmMarkPagesFree(off_t page, off_t count) {
8579 off_t j;
8580
8581 for (j = 0; j < count; j++)
8582 vmMarkPageFree(page+j);
8583 server.vm_stats_used_pages -= count;
8584 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8585 (long long)count, (long long)page);
8586 }
8587
8588 /* Test if the page is free */
8589 static int vmFreePage(off_t page) {
8590 off_t byte = page/8;
8591 int bit = page&7;
8592 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8593 }
8594
8595 /* Find N contiguous free pages storing the first page of the cluster in *first.
8596 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8597 * REDIS_ERR is returned.
8598 *
8599 * This function uses a simple algorithm: we try to allocate
8600 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8601 * again from the start of the swap file searching for free spaces.
8602 *
8603 * If it looks pretty clear that there are no free pages near our offset
8604 * we try to find less populated places doing a forward jump of
8605 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8606 * without hurry, and then we jump again and so forth...
8607 *
8608 * This function can be improved using a free list to avoid to guess
8609 * too much, since we could collect data about freed pages.
8610 *
8611 * note: I implemented this function just after watching an episode of
8612 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8613 */
8614 static int vmFindContiguousPages(off_t *first, off_t n) {
8615 off_t base, offset = 0, since_jump = 0, numfree = 0;
8616
8617 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8618 server.vm_near_pages = 0;
8619 server.vm_next_page = 0;
8620 }
8621 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8622 base = server.vm_next_page;
8623
8624 while(offset < server.vm_pages) {
8625 off_t this = base+offset;
8626
8627 /* If we overflow, restart from page zero */
8628 if (this >= server.vm_pages) {
8629 this -= server.vm_pages;
8630 if (this == 0) {
8631 /* Just overflowed, what we found on tail is no longer
8632 * interesting, as it's no longer contiguous. */
8633 numfree = 0;
8634 }
8635 }
8636 if (vmFreePage(this)) {
8637 /* This is a free page */
8638 numfree++;
8639 /* Already got N free pages? Return to the caller, with success */
8640 if (numfree == n) {
8641 *first = this-(n-1);
8642 server.vm_next_page = this+1;
8643 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8644 return REDIS_OK;
8645 }
8646 } else {
8647 /* The current one is not a free page */
8648 numfree = 0;
8649 }
8650
8651 /* Fast-forward if the current page is not free and we already
8652 * searched enough near this place. */
8653 since_jump++;
8654 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8655 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8656 since_jump = 0;
8657 /* Note that even if we rewind after the jump, we are don't need
8658 * to make sure numfree is set to zero as we only jump *if* it
8659 * is set to zero. */
8660 } else {
8661 /* Otherwise just check the next page */
8662 offset++;
8663 }
8664 }
8665 return REDIS_ERR;
8666 }
8667
8668 /* Write the specified object at the specified page of the swap file */
8669 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8670 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8671 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8672 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8673 redisLog(REDIS_WARNING,
8674 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8675 strerror(errno));
8676 return REDIS_ERR;
8677 }
8678 rdbSaveObject(server.vm_fp,o);
8679 fflush(server.vm_fp);
8680 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8681 return REDIS_OK;
8682 }
8683
8684 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8685 * needed to later retrieve the object into the key object.
8686 * If we can't find enough contiguous empty pages to swap the object on disk
8687 * REDIS_ERR is returned. */
8688 static int vmSwapObjectBlocking(robj *key, robj *val) {
8689 off_t pages = rdbSavedObjectPages(val,NULL);
8690 off_t page;
8691
8692 assert(key->storage == REDIS_VM_MEMORY);
8693 assert(key->refcount == 1);
8694 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8695 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8696 key->vm.page = page;
8697 key->vm.usedpages = pages;
8698 key->storage = REDIS_VM_SWAPPED;
8699 key->vtype = val->type;
8700 decrRefCount(val); /* Deallocate the object from memory. */
8701 vmMarkPagesUsed(page,pages);
8702 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8703 (unsigned char*) key->ptr,
8704 (unsigned long long) page, (unsigned long long) pages);
8705 server.vm_stats_swapped_objects++;
8706 server.vm_stats_swapouts++;
8707 return REDIS_OK;
8708 }
8709
8710 static robj *vmReadObjectFromSwap(off_t page, int type) {
8711 robj *o;
8712
8713 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8714 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8715 redisLog(REDIS_WARNING,
8716 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8717 strerror(errno));
8718 _exit(1);
8719 }
8720 o = rdbLoadObject(type,server.vm_fp);
8721 if (o == NULL) {
8722 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8723 _exit(1);
8724 }
8725 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8726 return o;
8727 }
8728
8729 /* Load the value object relative to the 'key' object from swap to memory.
8730 * The newly allocated object is returned.
8731 *
8732 * If preview is true the unserialized object is returned to the caller but
8733 * no changes are made to the key object, nor the pages are marked as freed */
8734 static robj *vmGenericLoadObject(robj *key, int preview) {
8735 robj *val;
8736
8737 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8738 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8739 if (!preview) {
8740 key->storage = REDIS_VM_MEMORY;
8741 key->vm.atime = server.unixtime;
8742 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8743 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8744 (unsigned char*) key->ptr);
8745 server.vm_stats_swapped_objects--;
8746 } else {
8747 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8748 (unsigned char*) key->ptr);
8749 }
8750 server.vm_stats_swapins++;
8751 return val;
8752 }
8753
8754 /* Plain object loading, from swap to memory */
8755 static robj *vmLoadObject(robj *key) {
8756 /* If we are loading the object in background, stop it, we
8757 * need to load this object synchronously ASAP. */
8758 if (key->storage == REDIS_VM_LOADING)
8759 vmCancelThreadedIOJob(key);
8760 return vmGenericLoadObject(key,0);
8761 }
8762
8763 /* Just load the value on disk, without to modify the key.
8764 * This is useful when we want to perform some operation on the value
8765 * without to really bring it from swap to memory, like while saving the
8766 * dataset or rewriting the append only log. */
8767 static robj *vmPreviewObject(robj *key) {
8768 return vmGenericLoadObject(key,1);
8769 }
8770
8771 /* How a good candidate is this object for swapping?
8772 * The better candidate it is, the greater the returned value.
8773 *
8774 * Currently we try to perform a fast estimation of the object size in
8775 * memory, and combine it with aging informations.
8776 *
8777 * Basically swappability = idle-time * log(estimated size)
8778 *
8779 * Bigger objects are preferred over smaller objects, but not
8780 * proportionally, this is why we use the logarithm. This algorithm is
8781 * just a first try and will probably be tuned later. */
8782 static double computeObjectSwappability(robj *o) {
8783 time_t age = server.unixtime - o->vm.atime;
8784 long asize = 0;
8785 list *l;
8786 dict *d;
8787 struct dictEntry *de;
8788 int z;
8789
8790 if (age <= 0) return 0;
8791 switch(o->type) {
8792 case REDIS_STRING:
8793 if (o->encoding != REDIS_ENCODING_RAW) {
8794 asize = sizeof(*o);
8795 } else {
8796 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8797 }
8798 break;
8799 case REDIS_LIST:
8800 l = o->ptr;
8801 listNode *ln = listFirst(l);
8802
8803 asize = sizeof(list);
8804 if (ln) {
8805 robj *ele = ln->value;
8806 long elesize;
8807
8808 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8809 (sizeof(*o)+sdslen(ele->ptr)) :
8810 sizeof(*o);
8811 asize += (sizeof(listNode)+elesize)*listLength(l);
8812 }
8813 break;
8814 case REDIS_SET:
8815 case REDIS_ZSET:
8816 z = (o->type == REDIS_ZSET);
8817 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8818
8819 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8820 if (z) asize += sizeof(zset)-sizeof(dict);
8821 if (dictSize(d)) {
8822 long elesize;
8823 robj *ele;
8824
8825 de = dictGetRandomKey(d);
8826 ele = dictGetEntryKey(de);
8827 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8828 (sizeof(*o)+sdslen(ele->ptr)) :
8829 sizeof(*o);
8830 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8831 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8832 }
8833 break;
8834 case REDIS_HASH:
8835 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8836 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8837 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8838 unsigned int klen, vlen;
8839 unsigned char *key, *val;
8840
8841 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8842 klen = 0;
8843 vlen = 0;
8844 }
8845 asize = len*(klen+vlen+3);
8846 } else if (o->encoding == REDIS_ENCODING_HT) {
8847 d = o->ptr;
8848 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8849 if (dictSize(d)) {
8850 long elesize;
8851 robj *ele;
8852
8853 de = dictGetRandomKey(d);
8854 ele = dictGetEntryKey(de);
8855 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8856 (sizeof(*o)+sdslen(ele->ptr)) :
8857 sizeof(*o);
8858 ele = dictGetEntryVal(de);
8859 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8860 (sizeof(*o)+sdslen(ele->ptr)) :
8861 sizeof(*o);
8862 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8863 }
8864 }
8865 break;
8866 }
8867 return (double)age*log(1+asize);
8868 }
8869
8870 /* Try to swap an object that's a good candidate for swapping.
8871 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8872 * to swap any object at all.
8873 *
8874 * If 'usethreaded' is true, Redis will try to swap the object in background
8875 * using I/O threads. */
8876 static int vmSwapOneObject(int usethreads) {
8877 int j, i;
8878 struct dictEntry *best = NULL;
8879 double best_swappability = 0;
8880 redisDb *best_db = NULL;
8881 robj *key, *val;
8882
8883 for (j = 0; j < server.dbnum; j++) {
8884 redisDb *db = server.db+j;
8885 /* Why maxtries is set to 100?
8886 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8887 * are swappable objects */
8888 int maxtries = 100;
8889
8890 if (dictSize(db->dict) == 0) continue;
8891 for (i = 0; i < 5; i++) {
8892 dictEntry *de;
8893 double swappability;
8894
8895 if (maxtries) maxtries--;
8896 de = dictGetRandomKey(db->dict);
8897 key = dictGetEntryKey(de);
8898 val = dictGetEntryVal(de);
8899 /* Only swap objects that are currently in memory.
8900 *
8901 * Also don't swap shared objects if threaded VM is on, as we
8902 * try to ensure that the main thread does not touch the
8903 * object while the I/O thread is using it, but we can't
8904 * control other keys without adding additional mutex. */
8905 if (key->storage != REDIS_VM_MEMORY ||
8906 (server.vm_max_threads != 0 && val->refcount != 1)) {
8907 if (maxtries) i--; /* don't count this try */
8908 continue;
8909 }
8910 swappability = computeObjectSwappability(val);
8911 if (!best || swappability > best_swappability) {
8912 best = de;
8913 best_swappability = swappability;
8914 best_db = db;
8915 }
8916 }
8917 }
8918 if (best == NULL) return REDIS_ERR;
8919 key = dictGetEntryKey(best);
8920 val = dictGetEntryVal(best);
8921
8922 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8923 key->ptr, best_swappability);
8924
8925 /* Unshare the key if needed */
8926 if (key->refcount > 1) {
8927 robj *newkey = dupStringObject(key);
8928 decrRefCount(key);
8929 key = dictGetEntryKey(best) = newkey;
8930 }
8931 /* Swap it */
8932 if (usethreads) {
8933 vmSwapObjectThreaded(key,val,best_db);
8934 return REDIS_OK;
8935 } else {
8936 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8937 dictGetEntryVal(best) = NULL;
8938 return REDIS_OK;
8939 } else {
8940 return REDIS_ERR;
8941 }
8942 }
8943 }
8944
/* Swap out one object synchronously. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectBlocking(void) {
    int retval = vmSwapOneObject(0);

    return retval;
}
8948
/* Swap out one object using the I/O threads. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectThreaded(void) {
    int retval = vmSwapOneObject(1);

    return retval;
}
8952
8953 /* Return true if it's safe to swap out objects in a given moment.
8954 * Basically we don't want to swap objects out while there is a BGSAVE
8955 * or a BGAEOREWRITE running in backgroud. */
8956 static int vmCanSwapOut(void) {
8957 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8958 }
8959
8960 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8961 * and was deleted. Otherwise 0 is returned. */
8962 static int deleteIfSwapped(redisDb *db, robj *key) {
8963 dictEntry *de;
8964 robj *foundkey;
8965
8966 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8967 foundkey = dictGetEntryKey(de);
8968 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8969 deleteKey(db,key);
8970 return 1;
8971 }
8972
8973 /* =================== Virtual Memory - Threaded I/O ======================= */
8974
8975 static void freeIOJob(iojob *j) {
8976 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8977 j->type == REDIS_IOJOB_DO_SWAP ||
8978 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8979 decrRefCount(j->val);
8980 /* We don't decrRefCount the j->key field as we did't incremented
8981 * the count creating IO Jobs. This is because the key field here is
8982 * just used as an indentifier and if a key is removed the Job should
8983 * never be touched again. */
8984 zfree(j);
8985 }
8986
8987 /* Every time a thread finished a Job, it writes a byte into the write side
8988 * of an unix pipe in order to "awake" the main thread, and this function
8989 * is called. */
8990 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8991 int mask)
8992 {
8993 char buf[1];
8994 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8995 REDIS_NOTUSED(el);
8996 REDIS_NOTUSED(mask);
8997 REDIS_NOTUSED(privdata);
8998
8999 /* For every byte we read in the read side of the pipe, there is one
9000 * I/O job completed to process. */
9001 while((retval = read(fd,buf,1)) == 1) {
9002 iojob *j;
9003 listNode *ln;
9004 robj *key;
9005 struct dictEntry *de;
9006
9007 redisLog(REDIS_DEBUG,"Processing I/O completed job");
9008
9009 /* Get the processed element (the oldest one) */
9010 lockThreadedIO();
9011 assert(listLength(server.io_processed) != 0);
9012 if (toprocess == -1) {
9013 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9014 if (toprocess <= 0) toprocess = 1;
9015 }
9016 ln = listFirst(server.io_processed);
9017 j = ln->value;
9018 listDelNode(server.io_processed,ln);
9019 unlockThreadedIO();
9020 /* If this job is marked as canceled, just ignore it */
9021 if (j->canceled) {
9022 freeIOJob(j);
9023 continue;
9024 }
9025 /* Post process it in the main thread, as there are things we
9026 * can do just here to avoid race conditions and/or invasive locks */
9027 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
9028 de = dictFind(j->db->dict,j->key);
9029 assert(de != NULL);
9030 key = dictGetEntryKey(de);
9031 if (j->type == REDIS_IOJOB_LOAD) {
9032 redisDb *db;
9033
9034 /* Key loaded, bring it at home */
9035 key->storage = REDIS_VM_MEMORY;
9036 key->vm.atime = server.unixtime;
9037 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9038 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9039 (unsigned char*) key->ptr);
9040 server.vm_stats_swapped_objects--;
9041 server.vm_stats_swapins++;
9042 dictGetEntryVal(de) = j->val;
9043 incrRefCount(j->val);
9044 db = j->db;
9045 freeIOJob(j);
9046 /* Handle clients waiting for this key to be loaded. */
9047 handleClientsBlockedOnSwappedKey(db,key);
9048 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9049 /* Now we know the amount of pages required to swap this object.
9050 * Let's find some space for it, and queue this task again
9051 * rebranded as REDIS_IOJOB_DO_SWAP. */
9052 if (!vmCanSwapOut() ||
9053 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9054 {
9055 /* Ooops... no space or we can't swap as there is
9056 * a fork()ed Redis trying to save stuff on disk. */
9057 freeIOJob(j);
9058 key->storage = REDIS_VM_MEMORY; /* undo operation */
9059 } else {
9060 /* Note that we need to mark this pages as used now,
9061 * if the job will be canceled, we'll mark them as freed
9062 * again. */
9063 vmMarkPagesUsed(j->page,j->pages);
9064 j->type = REDIS_IOJOB_DO_SWAP;
9065 lockThreadedIO();
9066 queueIOJob(j);
9067 unlockThreadedIO();
9068 }
9069 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9070 robj *val;
9071
9072 /* Key swapped. We can finally free some memory. */
9073 if (key->storage != REDIS_VM_SWAPPING) {
9074 printf("key->storage: %d\n",key->storage);
9075 printf("key->name: %s\n",(char*)key->ptr);
9076 printf("key->refcount: %d\n",key->refcount);
9077 printf("val: %p\n",(void*)j->val);
9078 printf("val->type: %d\n",j->val->type);
9079 printf("val->ptr: %s\n",(char*)j->val->ptr);
9080 }
9081 redisAssert(key->storage == REDIS_VM_SWAPPING);
9082 val = dictGetEntryVal(de);
9083 key->vm.page = j->page;
9084 key->vm.usedpages = j->pages;
9085 key->storage = REDIS_VM_SWAPPED;
9086 key->vtype = j->val->type;
9087 decrRefCount(val); /* Deallocate the object from memory. */
9088 dictGetEntryVal(de) = NULL;
9089 redisLog(REDIS_DEBUG,
9090 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9091 (unsigned char*) key->ptr,
9092 (unsigned long long) j->page, (unsigned long long) j->pages);
9093 server.vm_stats_swapped_objects++;
9094 server.vm_stats_swapouts++;
9095 freeIOJob(j);
9096 /* Put a few more swap requests in queue if we are still
9097 * out of memory */
9098 if (trytoswap && vmCanSwapOut() &&
9099 zmalloc_used_memory() > server.vm_max_memory)
9100 {
9101 int more = 1;
9102 while(more) {
9103 lockThreadedIO();
9104 more = listLength(server.io_newjobs) <
9105 (unsigned) server.vm_max_threads;
9106 unlockThreadedIO();
9107 /* Don't waste CPU time if swappable objects are rare. */
9108 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9109 trytoswap = 0;
9110 break;
9111 }
9112 }
9113 }
9114 }
9115 processed++;
9116 if (processed == toprocess) return;
9117 }
9118 if (retval < 0 && errno != EAGAIN) {
9119 redisLog(REDIS_WARNING,
9120 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9121 strerror(errno));
9122 }
9123 }
9124
9125 static void lockThreadedIO(void) {
9126 pthread_mutex_lock(&server.io_mutex);
9127 }
9128
9129 static void unlockThreadedIO(void) {
9130 pthread_mutex_unlock(&server.io_mutex);
9131 }
9132
9133 /* Remove the specified object from the threaded I/O queue if still not
9134 * processed, otherwise make sure to flag it as canceled. */
9135 static void vmCancelThreadedIOJob(robj *o) {
9136 list *lists[3] = {
9137 server.io_newjobs, /* 0 */
9138 server.io_processing, /* 1 */
9139 server.io_processed /* 2 */
9140 };
9141 int i;
9142
9143 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9144 again:
9145 lockThreadedIO();
9146 /* Search for a matching key in one of the queues */
9147 for (i = 0; i < 3; i++) {
9148 listNode *ln;
9149 listIter li;
9150
9151 listRewind(lists[i],&li);
9152 while ((ln = listNext(&li)) != NULL) {
9153 iojob *job = ln->value;
9154
9155 if (job->canceled) continue; /* Skip this, already canceled. */
9156 if (job->key == o) {
9157 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9158 (void*)job, (char*)o->ptr, job->type, i);
9159 /* Mark the pages as free since the swap didn't happened
9160 * or happened but is now discarded. */
9161 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9162 vmMarkPagesFree(job->page,job->pages);
9163 /* Cancel the job. It depends on the list the job is
9164 * living in. */
9165 switch(i) {
9166 case 0: /* io_newjobs */
9167 /* If the job was yet not processed the best thing to do
9168 * is to remove it from the queue at all */
9169 freeIOJob(job);
9170 listDelNode(lists[i],ln);
9171 break;
9172 case 1: /* io_processing */
9173 /* Oh Shi- the thread is messing with the Job:
9174 *
9175 * Probably it's accessing the object if this is a
9176 * PREPARE_SWAP or DO_SWAP job.
9177 * If it's a LOAD job it may be reading from disk and
9178 * if we don't wait for the job to terminate before to
9179 * cancel it, maybe in a few microseconds data can be
9180 * corrupted in this pages. So the short story is:
9181 *
9182 * Better to wait for the job to move into the
9183 * next queue (processed)... */
9184
9185 /* We try again and again until the job is completed. */
9186 unlockThreadedIO();
9187 /* But let's wait some time for the I/O thread
9188 * to finish with this job. After all this condition
9189 * should be very rare. */
9190 usleep(1);
9191 goto again;
9192 case 2: /* io_processed */
9193 /* The job was already processed, that's easy...
9194 * just mark it as canceled so that we'll ignore it
9195 * when processing completed jobs. */
9196 job->canceled = 1;
9197 break;
9198 }
9199 /* Finally we have to adjust the storage type of the object
9200 * in order to "UNDO" the operaiton. */
9201 if (o->storage == REDIS_VM_LOADING)
9202 o->storage = REDIS_VM_SWAPPED;
9203 else if (o->storage == REDIS_VM_SWAPPING)
9204 o->storage = REDIS_VM_MEMORY;
9205 unlockThreadedIO();
9206 return;
9207 }
9208 }
9209 }
9210 unlockThreadedIO();
9211 assert(1 != 1); /* We should never reach this */
9212 }
9213
9214 static void *IOThreadEntryPoint(void *arg) {
9215 iojob *j;
9216 listNode *ln;
9217 REDIS_NOTUSED(arg);
9218
9219 pthread_detach(pthread_self());
9220 while(1) {
9221 /* Get a new job to process */
9222 lockThreadedIO();
9223 if (listLength(server.io_newjobs) == 0) {
9224 /* No new jobs in queue, exit. */
9225 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9226 (long) pthread_self());
9227 server.io_active_threads--;
9228 unlockThreadedIO();
9229 return NULL;
9230 }
9231 ln = listFirst(server.io_newjobs);
9232 j = ln->value;
9233 listDelNode(server.io_newjobs,ln);
9234 /* Add the job in the processing queue */
9235 j->thread = pthread_self();
9236 listAddNodeTail(server.io_processing,j);
9237 ln = listLast(server.io_processing); /* We use ln later to remove it */
9238 unlockThreadedIO();
9239 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9240 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9241
9242 /* Process the Job */
9243 if (j->type == REDIS_IOJOB_LOAD) {
9244 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9245 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9246 FILE *fp = fopen("/dev/null","w+");
9247 j->pages = rdbSavedObjectPages(j->val,fp);
9248 fclose(fp);
9249 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9250 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9251 j->canceled = 1;
9252 }
9253
9254 /* Done: insert the job into the processed queue */
9255 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9256 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9257 lockThreadedIO();
9258 listDelNode(server.io_processing,ln);
9259 listAddNodeTail(server.io_processed,j);
9260 unlockThreadedIO();
9261
9262 /* Signal the main thread there is new stuff to process */
9263 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9264 }
9265 return NULL; /* never reached */
9266 }
9267
9268 static void spawnIOThread(void) {
9269 pthread_t thread;
9270 sigset_t mask, omask;
9271 int err;
9272
9273 sigemptyset(&mask);
9274 sigaddset(&mask,SIGCHLD);
9275 sigaddset(&mask,SIGHUP);
9276 sigaddset(&mask,SIGPIPE);
9277 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9278 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9279 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9280 strerror(err));
9281 usleep(1000000);
9282 }
9283 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9284 server.io_active_threads++;
9285 }
9286
9287 /* We need to wait for the last thread to exit before we are able to
9288 * fork() in order to BGSAVE or BGREWRITEAOF. */
9289 static void waitEmptyIOJobsQueue(void) {
9290 while(1) {
9291 int io_processed_len;
9292
9293 lockThreadedIO();
9294 if (listLength(server.io_newjobs) == 0 &&
9295 listLength(server.io_processing) == 0 &&
9296 server.io_active_threads == 0)
9297 {
9298 unlockThreadedIO();
9299 return;
9300 }
9301 /* While waiting for empty jobs queue condition we post-process some
9302 * finshed job, as I/O threads may be hanging trying to write against
9303 * the io_ready_pipe_write FD but there are so much pending jobs that
9304 * it's blocking. */
9305 io_processed_len = listLength(server.io_processed);
9306 unlockThreadedIO();
9307 if (io_processed_len) {
9308 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9309 usleep(1000); /* 1 millisecond */
9310 } else {
9311 usleep(10000); /* 10 milliseconds */
9312 }
9313 }
9314 }
9315
9316 static void vmReopenSwapFile(void) {
9317 /* Note: we don't close the old one as we are in the child process
9318 * and don't want to mess at all with the original file object. */
9319 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9320 if (server.vm_fp == NULL) {
9321 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9322 server.vm_swap_file);
9323 _exit(1);
9324 }
9325 server.vm_fd = fileno(server.vm_fp);
9326 }
9327
9328 /* This function must be called while with threaded IO locked */
9329 static void queueIOJob(iojob *j) {
9330 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9331 (void*)j, j->type, (char*)j->key->ptr);
9332 listAddNodeTail(server.io_newjobs,j);
9333 if (server.io_active_threads < server.vm_max_threads)
9334 spawnIOThread();
9335 }
9336
9337 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9338 iojob *j;
9339
9340 assert(key->storage == REDIS_VM_MEMORY);
9341 assert(key->refcount == 1);
9342
9343 j = zmalloc(sizeof(*j));
9344 j->type = REDIS_IOJOB_PREPARE_SWAP;
9345 j->db = db;
9346 j->key = key;
9347 j->val = val;
9348 incrRefCount(val);
9349 j->canceled = 0;
9350 j->thread = (pthread_t) -1;
9351 key->storage = REDIS_VM_SWAPPING;
9352
9353 lockThreadedIO();
9354 queueIOJob(j);
9355 unlockThreadedIO();
9356 return REDIS_OK;
9357 }
9358
9359 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9360
9361 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9362 * If there is not already a job loading the key, it is craeted.
9363 * The key is added to the io_keys list in the client structure, and also
9364 * in the hash table mapping swapped keys to waiting clients, that is,
9365 * server.io_waited_keys. */
9366 static int waitForSwappedKey(redisClient *c, robj *key) {
9367 struct dictEntry *de;
9368 robj *o;
9369 list *l;
9370
9371 /* If the key does not exist or is already in RAM we don't need to
9372 * block the client at all. */
9373 de = dictFind(c->db->dict,key);
9374 if (de == NULL) return 0;
9375 o = dictGetEntryKey(de);
9376 if (o->storage == REDIS_VM_MEMORY) {
9377 return 0;
9378 } else if (o->storage == REDIS_VM_SWAPPING) {
9379 /* We were swapping the key, undo it! */
9380 vmCancelThreadedIOJob(o);
9381 return 0;
9382 }
9383
9384 /* OK: the key is either swapped, or being loaded just now. */
9385
9386 /* Add the key to the list of keys this client is waiting for.
9387 * This maps clients to keys they are waiting for. */
9388 listAddNodeTail(c->io_keys,key);
9389 incrRefCount(key);
9390
9391 /* Add the client to the swapped keys => clients waiting map. */
9392 de = dictFind(c->db->io_keys,key);
9393 if (de == NULL) {
9394 int retval;
9395
9396 /* For every key we take a list of clients blocked for it */
9397 l = listCreate();
9398 retval = dictAdd(c->db->io_keys,key,l);
9399 incrRefCount(key);
9400 assert(retval == DICT_OK);
9401 } else {
9402 l = dictGetEntryVal(de);
9403 }
9404 listAddNodeTail(l,c);
9405
9406 /* Are we already loading the key from disk? If not create a job */
9407 if (o->storage == REDIS_VM_SWAPPED) {
9408 iojob *j;
9409
9410 o->storage = REDIS_VM_LOADING;
9411 j = zmalloc(sizeof(*j));
9412 j->type = REDIS_IOJOB_LOAD;
9413 j->db = c->db;
9414 j->key = o;
9415 j->key->vtype = o->vtype;
9416 j->page = o->vm.page;
9417 j->val = NULL;
9418 j->canceled = 0;
9419 j->thread = (pthread_t) -1;
9420 lockThreadedIO();
9421 queueIOJob(j);
9422 unlockThreadedIO();
9423 }
9424 return 1;
9425 }
9426
9427 /* Preload keys needed for the ZUNION and ZINTER commands. */
9428 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9429 int i, num;
9430 num = atoi(c->argv[2]->ptr);
9431 for (i = 0; i < num; i++) {
9432 waitForSwappedKey(c,c->argv[3+i]);
9433 }
9434 }
9435
9436 /* Is this client attempting to run a command against swapped keys?
9437 * If so, block it ASAP, load the keys in background, then resume it.
9438 *
9439 * The important idea about this function is that it can fail! If keys will
9440 * still be swapped when the client is resumed, this key lookups will
9441 * just block loading keys from disk. In practical terms this should only
9442 * happen with SORT BY command or if there is a bug in this function.
9443 *
9444 * Return 1 if the client is marked as blocked, 0 if the client can
9445 * continue as the keys it is going to access appear to be in memory. */
9446 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9447 int j, last;
9448
9449 if (cmd->vm_preload_proc != NULL) {
9450 cmd->vm_preload_proc(c);
9451 } else {
9452 if (cmd->vm_firstkey == 0) return 0;
9453 last = cmd->vm_lastkey;
9454 if (last < 0) last = c->argc+last;
9455 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9456 waitForSwappedKey(c,c->argv[j]);
9457 }
9458
9459 /* If the client was blocked for at least one key, mark it as blocked. */
9460 if (listLength(c->io_keys)) {
9461 c->flags |= REDIS_IO_WAIT;
9462 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9463 server.vm_blocked_clients++;
9464 return 1;
9465 } else {
9466 return 0;
9467 }
9468 }
9469
9470 /* Remove the 'key' from the list of blocked keys for a given client.
9471 *
9472 * The function returns 1 when there are no longer blocking keys after
9473 * the current one was removed (and the client can be unblocked). */
9474 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9475 list *l;
9476 listNode *ln;
9477 listIter li;
9478 struct dictEntry *de;
9479
9480 /* Remove the key from the list of keys this client is waiting for. */
9481 listRewind(c->io_keys,&li);
9482 while ((ln = listNext(&li)) != NULL) {
9483 if (compareStringObjects(ln->value,key) == 0) {
9484 listDelNode(c->io_keys,ln);
9485 break;
9486 }
9487 }
9488 assert(ln != NULL);
9489
9490 /* Remove the client form the key => waiting clients map. */
9491 de = dictFind(c->db->io_keys,key);
9492 assert(de != NULL);
9493 l = dictGetEntryVal(de);
9494 ln = listSearchKey(l,c);
9495 assert(ln != NULL);
9496 listDelNode(l,ln);
9497 if (listLength(l) == 0)
9498 dictDelete(c->db->io_keys,key);
9499
9500 return listLength(c->io_keys) == 0;
9501 }
9502
9503 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9504 struct dictEntry *de;
9505 list *l;
9506 listNode *ln;
9507 int len;
9508
9509 de = dictFind(db->io_keys,key);
9510 if (!de) return;
9511
9512 l = dictGetEntryVal(de);
9513 len = listLength(l);
9514 /* Note: we can't use something like while(listLength(l)) as the list
9515 * can be freed by the calling function when we remove the last element. */
9516 while (len--) {
9517 ln = listFirst(l);
9518 redisClient *c = ln->value;
9519
9520 if (dontWaitForSwappedKey(c,key)) {
9521 /* Put the client in the list of clients ready to go as we
9522 * loaded all the keys about it. */
9523 listAddNodeTail(server.io_ready_clients,c);
9524 }
9525 }
9526 }
9527
9528 /* =========================== Remote Configuration ========================= */
9529
9530 static void configSetCommand(redisClient *c) {
9531 robj *o = getDecodedObject(c->argv[3]);
9532 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9533 zfree(server.dbfilename);
9534 server.dbfilename = zstrdup(o->ptr);
9535 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9536 zfree(server.requirepass);
9537 server.requirepass = zstrdup(o->ptr);
9538 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9539 zfree(server.masterauth);
9540 server.masterauth = zstrdup(o->ptr);
9541 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9542 server.maxmemory = strtoll(o->ptr, NULL, 10);
9543 } else {
9544 addReplySds(c,sdscatprintf(sdsempty(),
9545 "-ERR not supported CONFIG parameter %s\r\n",
9546 (char*)c->argv[2]->ptr));
9547 decrRefCount(o);
9548 return;
9549 }
9550 decrRefCount(o);
9551 addReply(c,shared.ok);
9552 }
9553
9554 static void configGetCommand(redisClient *c) {
9555 robj *o = getDecodedObject(c->argv[2]);
9556 robj *lenobj = createObject(REDIS_STRING,NULL);
9557 char *pattern = o->ptr;
9558 int matches = 0;
9559
9560 addReply(c,lenobj);
9561 decrRefCount(lenobj);
9562
9563 if (stringmatch(pattern,"dbfilename",0)) {
9564 addReplyBulkCString(c,"dbfilename");
9565 addReplyBulkCString(c,server.dbfilename);
9566 matches++;
9567 }
9568 if (stringmatch(pattern,"requirepass",0)) {
9569 addReplyBulkCString(c,"requirepass");
9570 addReplyBulkCString(c,server.requirepass);
9571 matches++;
9572 }
9573 if (stringmatch(pattern,"masterauth",0)) {
9574 addReplyBulkCString(c,"masterauth");
9575 addReplyBulkCString(c,server.masterauth);
9576 matches++;
9577 }
9578 if (stringmatch(pattern,"maxmemory",0)) {
9579 char buf[128];
9580
9581 snprintf(buf,128,"%llu\n",server.maxmemory);
9582 addReplyBulkCString(c,"maxmemory");
9583 addReplyBulkCString(c,buf);
9584 matches++;
9585 }
9586 decrRefCount(o);
9587 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9588 }
9589
9590 static void configCommand(redisClient *c) {
9591 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9592 if (c->argc != 4) goto badarity;
9593 configSetCommand(c);
9594 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9595 if (c->argc != 3) goto badarity;
9596 configGetCommand(c);
9597 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9598 if (c->argc != 2) goto badarity;
9599 server.stat_numcommands = 0;
9600 server.stat_numconnections = 0;
9601 server.stat_expiredkeys = 0;
9602 server.stat_starttime = time(NULL);
9603 addReply(c,shared.ok);
9604 } else {
9605 addReplySds(c,sdscatprintf(sdsempty(),
9606 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9607 }
9608 return;
9609
9610 badarity:
9611 addReplySds(c,sdscatprintf(sdsempty(),
9612 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9613 (char*) c->argv[1]->ptr));
9614 }
9615
9616 /* =========================== Pubsub implementation ======================== */
9617
9618 static void freePubsubPattern(void *p) {
9619 pubsubPattern *pat = p;
9620
9621 decrRefCount(pat->pattern);
9622 zfree(pat);
9623 }
9624
9625 static int listMatchPubsubPattern(void *a, void *b) {
9626 pubsubPattern *pa = a, *pb = b;
9627
9628 return (pa->client == pb->client) &&
9629 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9630 }
9631
9632 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9633 * 0 if the client was already subscribed to that channel. */
9634 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9635 struct dictEntry *de;
9636 list *clients = NULL;
9637 int retval = 0;
9638
9639 /* Add the channel to the client -> channels hash table */
9640 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9641 retval = 1;
9642 incrRefCount(channel);
9643 /* Add the client to the channel -> list of clients hash table */
9644 de = dictFind(server.pubsub_channels,channel);
9645 if (de == NULL) {
9646 clients = listCreate();
9647 dictAdd(server.pubsub_channels,channel,clients);
9648 incrRefCount(channel);
9649 } else {
9650 clients = dictGetEntryVal(de);
9651 }
9652 listAddNodeTail(clients,c);
9653 }
9654 /* Notify the client */
9655 addReply(c,shared.mbulk3);
9656 addReply(c,shared.subscribebulk);
9657 addReplyBulk(c,channel);
9658 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9659 return retval;
9660 }
9661
9662 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9663 * 0 if the client was not subscribed to the specified channel. */
9664 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9665 struct dictEntry *de;
9666 list *clients;
9667 listNode *ln;
9668 int retval = 0;
9669
9670 /* Remove the channel from the client -> channels hash table */
9671 incrRefCount(channel); /* channel may be just a pointer to the same object
9672 we have in the hash tables. Protect it... */
9673 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9674 retval = 1;
9675 /* Remove the client from the channel -> clients list hash table */
9676 de = dictFind(server.pubsub_channels,channel);
9677 assert(de != NULL);
9678 clients = dictGetEntryVal(de);
9679 ln = listSearchKey(clients,c);
9680 assert(ln != NULL);
9681 listDelNode(clients,ln);
9682 if (listLength(clients) == 0) {
9683 /* Free the list and associated hash entry at all if this was
9684 * the latest client, so that it will be possible to abuse
9685 * Redis PUBSUB creating millions of channels. */
9686 dictDelete(server.pubsub_channels,channel);
9687 }
9688 }
9689 /* Notify the client */
9690 if (notify) {
9691 addReply(c,shared.mbulk3);
9692 addReply(c,shared.unsubscribebulk);
9693 addReplyBulk(c,channel);
9694 addReplyLong(c,dictSize(c->pubsub_channels)+
9695 listLength(c->pubsub_patterns));
9696
9697 }
9698 decrRefCount(channel); /* it is finally safe to release it */
9699 return retval;
9700 }
9701
9702 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9703 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9704 int retval = 0;
9705
9706 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9707 retval = 1;
9708 pubsubPattern *pat;
9709 listAddNodeTail(c->pubsub_patterns,pattern);
9710 incrRefCount(pattern);
9711 pat = zmalloc(sizeof(*pat));
9712 pat->pattern = getDecodedObject(pattern);
9713 pat->client = c;
9714 listAddNodeTail(server.pubsub_patterns,pat);
9715 }
9716 /* Notify the client */
9717 addReply(c,shared.mbulk3);
9718 addReply(c,shared.psubscribebulk);
9719 addReplyBulk(c,pattern);
9720 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9721 return retval;
9722 }
9723
9724 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9725 * 0 if the client was not subscribed to the specified channel. */
9726 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9727 listNode *ln;
9728 pubsubPattern pat;
9729 int retval = 0;
9730
9731 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9732 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9733 retval = 1;
9734 listDelNode(c->pubsub_patterns,ln);
9735 pat.client = c;
9736 pat.pattern = pattern;
9737 ln = listSearchKey(server.pubsub_patterns,&pat);
9738 listDelNode(server.pubsub_patterns,ln);
9739 }
9740 /* Notify the client */
9741 if (notify) {
9742 addReply(c,shared.mbulk3);
9743 addReply(c,shared.punsubscribebulk);
9744 addReplyBulk(c,pattern);
9745 addReplyLong(c,dictSize(c->pubsub_channels)+
9746 listLength(c->pubsub_patterns));
9747 }
9748 decrRefCount(pattern);
9749 return retval;
9750 }
9751
9752 /* Unsubscribe from all the channels. Return the number of channels the
9753 * client was subscribed from. */
9754 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9755 dictIterator *di = dictGetIterator(c->pubsub_channels);
9756 dictEntry *de;
9757 int count = 0;
9758
9759 while((de = dictNext(di)) != NULL) {
9760 robj *channel = dictGetEntryKey(de);
9761
9762 count += pubsubUnsubscribeChannel(c,channel,notify);
9763 }
9764 dictReleaseIterator(di);
9765 return count;
9766 }
9767
9768 /* Unsubscribe from all the patterns. Return the number of patterns the
9769 * client was subscribed from. */
9770 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9771 listNode *ln;
9772 listIter li;
9773 int count = 0;
9774
9775 listRewind(c->pubsub_patterns,&li);
9776 while ((ln = listNext(&li)) != NULL) {
9777 robj *pattern = ln->value;
9778
9779 count += pubsubUnsubscribePattern(c,pattern,notify);
9780 }
9781 return count;
9782 }
9783
9784 /* Publish a message */
9785 static int pubsubPublishMessage(robj *channel, robj *message) {
9786 int receivers = 0;
9787 struct dictEntry *de;
9788 listNode *ln;
9789 listIter li;
9790
9791 /* Send to clients listening for that channel */
9792 de = dictFind(server.pubsub_channels,channel);
9793 if (de) {
9794 list *list = dictGetEntryVal(de);
9795 listNode *ln;
9796 listIter li;
9797
9798 listRewind(list,&li);
9799 while ((ln = listNext(&li)) != NULL) {
9800 redisClient *c = ln->value;
9801
9802 addReply(c,shared.mbulk3);
9803 addReply(c,shared.messagebulk);
9804 addReplyBulk(c,channel);
9805 addReplyBulk(c,message);
9806 receivers++;
9807 }
9808 }
9809 /* Send to clients listening to matching channels */
9810 if (listLength(server.pubsub_patterns)) {
9811 listRewind(server.pubsub_patterns,&li);
9812 channel = getDecodedObject(channel);
9813 while ((ln = listNext(&li)) != NULL) {
9814 pubsubPattern *pat = ln->value;
9815
9816 if (stringmatchlen((char*)pat->pattern->ptr,
9817 sdslen(pat->pattern->ptr),
9818 (char*)channel->ptr,
9819 sdslen(channel->ptr),0)) {
9820 addReply(pat->client,shared.mbulk3);
9821 addReply(pat->client,shared.messagebulk);
9822 addReplyBulk(pat->client,channel);
9823 addReplyBulk(pat->client,message);
9824 receivers++;
9825 }
9826 }
9827 decrRefCount(channel);
9828 }
9829 return receivers;
9830 }
9831
9832 static void subscribeCommand(redisClient *c) {
9833 int j;
9834
9835 for (j = 1; j < c->argc; j++)
9836 pubsubSubscribeChannel(c,c->argv[j]);
9837 }
9838
9839 static void unsubscribeCommand(redisClient *c) {
9840 if (c->argc == 1) {
9841 pubsubUnsubscribeAllChannels(c,1);
9842 return;
9843 } else {
9844 int j;
9845
9846 for (j = 1; j < c->argc; j++)
9847 pubsubUnsubscribeChannel(c,c->argv[j],1);
9848 }
9849 }
9850
9851 static void psubscribeCommand(redisClient *c) {
9852 int j;
9853
9854 for (j = 1; j < c->argc; j++)
9855 pubsubSubscribePattern(c,c->argv[j]);
9856 }
9857
9858 static void punsubscribeCommand(redisClient *c) {
9859 if (c->argc == 1) {
9860 pubsubUnsubscribeAllPatterns(c,1);
9861 return;
9862 } else {
9863 int j;
9864
9865 for (j = 1; j < c->argc; j++)
9866 pubsubUnsubscribePattern(c,c->argv[j],1);
9867 }
9868 }
9869
9870 static void publishCommand(redisClient *c) {
9871 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9872 addReplyLong(c,receivers);
9873 }
9874
9875 /* ================================= Debugging ============================== */
9876
9877 static void debugCommand(redisClient *c) {
9878 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9879 *((char*)-1) = 'x';
9880 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9881 if (rdbSave(server.dbfilename) != REDIS_OK) {
9882 addReply(c,shared.err);
9883 return;
9884 }
9885 emptyDb();
9886 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9887 addReply(c,shared.err);
9888 return;
9889 }
9890 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9891 addReply(c,shared.ok);
9892 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9893 emptyDb();
9894 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9895 addReply(c,shared.err);
9896 return;
9897 }
9898 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9899 addReply(c,shared.ok);
9900 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9901 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9902 robj *key, *val;
9903
9904 if (!de) {
9905 addReply(c,shared.nokeyerr);
9906 return;
9907 }
9908 key = dictGetEntryKey(de);
9909 val = dictGetEntryVal(de);
9910 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9911 key->storage == REDIS_VM_SWAPPING)) {
9912 char *strenc;
9913 char buf[128];
9914
9915 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9916 strenc = strencoding[val->encoding];
9917 } else {
9918 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9919 strenc = buf;
9920 }
9921 addReplySds(c,sdscatprintf(sdsempty(),
9922 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9923 "encoding:%s serializedlength:%lld\r\n",
9924 (void*)key, key->refcount, (void*)val, val->refcount,
9925 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9926 } else {
9927 addReplySds(c,sdscatprintf(sdsempty(),
9928 "+Key at:%p refcount:%d, value swapped at: page %llu "
9929 "using %llu pages\r\n",
9930 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9931 (unsigned long long) key->vm.usedpages));
9932 }
9933 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9934 lookupKeyRead(c->db,c->argv[2]);
9935 addReply(c,shared.ok);
9936 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9937 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9938 robj *key, *val;
9939
9940 if (!server.vm_enabled) {
9941 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9942 return;
9943 }
9944 if (!de) {
9945 addReply(c,shared.nokeyerr);
9946 return;
9947 }
9948 key = dictGetEntryKey(de);
9949 val = dictGetEntryVal(de);
9950 /* If the key is shared we want to create a copy */
9951 if (key->refcount > 1) {
9952 robj *newkey = dupStringObject(key);
9953 decrRefCount(key);
9954 key = dictGetEntryKey(de) = newkey;
9955 }
9956 /* Swap it */
9957 if (key->storage != REDIS_VM_MEMORY) {
9958 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9959 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9960 dictGetEntryVal(de) = NULL;
9961 addReply(c,shared.ok);
9962 } else {
9963 addReply(c,shared.err);
9964 }
9965 } else {
9966 addReplySds(c,sdsnew(
9967 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9968 }
9969 }
9970
9971 static void _redisAssert(char *estr, char *file, int line) {
9972 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9973 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9974 #ifdef HAVE_BACKTRACE
9975 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9976 *((char*)-1) = 'x';
9977 #endif
9978 }
9979
9980 static void _redisPanic(char *msg, char *file, int line) {
9981 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
9982 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
9983 #ifdef HAVE_BACKTRACE
9984 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9985 *((char*)-1) = 'x';
9986 #endif
9987 }
9988
9989 /* =================================== Main! ================================ */
9990
9991 #ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted to an integer with atoi(). Returns -1 if the file can't be
 * opened or no line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10005
10006 void linuxOvercommitMemoryWarning(void) {
10007 if (linuxOvercommitMemoryValue() == 0) {
10008 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
10009 }
10010 }
10011 #endif /* __linux__ */
10012
10013 static void daemonize(void) {
10014 int fd;
10015 FILE *fp;
10016
10017 if (fork() != 0) exit(0); /* parent exits */
10018 setsid(); /* create a new session */
10019
10020 /* Every output goes to /dev/null. If Redis is daemonized but
10021 * the 'logfile' is set to 'stdout' in the configuration file
10022 * it will not log at all. */
10023 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10024 dup2(fd, STDIN_FILENO);
10025 dup2(fd, STDOUT_FILENO);
10026 dup2(fd, STDERR_FILENO);
10027 if (fd > STDERR_FILENO) close(fd);
10028 }
10029 /* Try to write the pid file */
10030 fp = fopen(server.pidfile,"w");
10031 if (fp) {
10032 fprintf(fp,"%d\n",getpid());
10033 fclose(fp);
10034 }
10035 }
10036
10037 static void version() {
10038 printf("Redis server version %s\n", REDIS_VERSION);
10039 exit(0);
10040 }
10041
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10047
10048 int main(int argc, char **argv) {
10049 time_t start;
10050
10051 initServerConfig();
10052 if (argc == 2) {
10053 if (strcmp(argv[1], "-v") == 0 ||
10054 strcmp(argv[1], "--version") == 0) version();
10055 if (strcmp(argv[1], "--help") == 0) usage();
10056 resetServerSaveParams();
10057 loadServerConfig(argv[1]);
10058 } else if ((argc > 2)) {
10059 usage();
10060 } else {
10061 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10062 }
10063 if (server.daemonize) daemonize();
10064 initServer();
10065 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10066 #ifdef __linux__
10067 linuxOvercommitMemoryWarning();
10068 #endif
10069 start = time(NULL);
10070 if (server.appendonly) {
10071 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10072 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10073 } else {
10074 if (rdbLoad(server.dbfilename) == REDIS_OK)
10075 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10076 }
10077 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10078 aeSetBeforeSleepProc(server.el,beforeSleep);
10079 aeMain(server.el);
10080 aeDeleteEventLoop(server.el);
10081 return 0;
10082 }
10083
10084 /* ============================= Backtrace support ========================= */
10085
10086 #ifdef HAVE_BACKTRACE
10087 static char *findFuncName(void *pointer, unsigned long *offset);
10088
10089 static void *getMcontextEip(ucontext_t *uc) {
10090 #if defined(__FreeBSD__)
10091 return (void*) uc->uc_mcontext.mc_eip;
10092 #elif defined(__dietlibc__)
10093 return (void*) uc->uc_mcontext.eip;
10094 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10095 #if __x86_64__
10096 return (void*) uc->uc_mcontext->__ss.__rip;
10097 #else
10098 return (void*) uc->uc_mcontext->__ss.__eip;
10099 #endif
10100 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10101 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10102 return (void*) uc->uc_mcontext->__ss.__rip;
10103 #else
10104 return (void*) uc->uc_mcontext->__ss.__eip;
10105 #endif
10106 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10107 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10108 #elif defined(__ia64__) /* Linux IA64 */
10109 return (void*) uc->uc_mcontext.sc_ip;
10110 #else
10111 return NULL;
10112 #endif
10113 }
10114
10115 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10116 void *trace[100];
10117 char **messages = NULL;
10118 int i, trace_size = 0;
10119 unsigned long offset=0;
10120 ucontext_t *uc = (ucontext_t*) secret;
10121 sds infostring;
10122 REDIS_NOTUSED(info);
10123
10124 redisLog(REDIS_WARNING,
10125 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10126 infostring = genRedisInfoString();
10127 redisLog(REDIS_WARNING, "%s",infostring);
10128 /* It's not safe to sdsfree() the returned string under memory
10129 * corruption conditions. Let it leak as we are going to abort */
10130
10131 trace_size = backtrace(trace, 100);
10132 /* overwrite sigaction with caller's address */
10133 if (getMcontextEip(uc) != NULL) {
10134 trace[1] = getMcontextEip(uc);
10135 }
10136 messages = backtrace_symbols(trace, trace_size);
10137
10138 for (i=1; i<trace_size; ++i) {
10139 char *fn = findFuncName(trace[i], &offset), *p;
10140
10141 p = strchr(messages[i],'+');
10142 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10143 redisLog(REDIS_WARNING,"%s", messages[i]);
10144 } else {
10145 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10146 }
10147 }
10148 /* free(messages); Don't call free() with possibly corrupted memory. */
10149 _exit(0);
10150 }
10151
10152 static void setupSigSegvAction(void) {
10153 struct sigaction act;
10154
10155 sigemptyset (&act.sa_mask);
10156 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10157 * is used. Otherwise, sa_handler is used */
10158 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10159 act.sa_sigaction = segvHandler;
10160 sigaction (SIGSEGV, &act, NULL);
10161 sigaction (SIGBUS, &act, NULL);
10162 sigaction (SIGFPE, &act, NULL);
10163 sigaction (SIGILL, &act, NULL);
10164 sigaction (SIGBUS, &act, NULL);
10165 return;
10166 }
10167
10168 #include "staticsymbols.h"
10169 /* This function try to convert a pointer into a function name. It's used in
10170 * oreder to provide a backtrace under segmentation fault that's able to
10171 * display functions declared as static (otherwise the backtrace is useless). */
10172 static char *findFuncName(void *pointer, unsigned long *offset){
10173 int i, ret = -1;
10174 unsigned long off, minoff = 0;
10175
10176 /* Try to match against the Symbol with the smallest offset */
10177 for (i=0; symsTable[i].pointer; i++) {
10178 unsigned long lp = (unsigned long) pointer;
10179
10180 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10181 off=lp-symsTable[i].pointer;
10182 if (ret < 0 || off < minoff) {
10183 minoff=off;
10184 ret=i;
10185 }
10186 }
10187 }
10188 if (ret == -1) return NULL;
10189 *offset = minoff;
10190 return symsTable[ret].name;
10191 }
10192 #else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Built without HAVE_BACKTRACE: there is no handler to install. */
    return;
}
10195 #endif /* HAVE_BACKTRACE */
10196
10197
10198
10199 /* The End */
10200
10201
10202