]> git.saurik.com Git - redis.git/blob - redis.c
de4f5763d79acb07a0f00572d6d06ee2a9b77f84
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114 #define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
115
116 /* Object types */
117 #define REDIS_STRING 0
118 #define REDIS_LIST 1
119 #define REDIS_SET 2
120 #define REDIS_ZSET 3
121 #define REDIS_HASH 4
122
123 /* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
126 #define REDIS_ENCODING_RAW 0 /* Raw representation */
127 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
128 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
130
131 static char* strencoding[] = { /* Encoding names listed in the same order as the REDIS_ENCODING_* values 0..3 */
132 "raw", "int", "zipmap", "hashtable"
133 };
134
135 /* Object types only used for dumping to disk */
136 #define REDIS_EXPIRETIME 253
137 #define REDIS_SELECTDB 254
138 #define REDIS_EOF 255
139
140 /* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
150 *
151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
153 #define REDIS_RDB_6BITLEN 0
154 #define REDIS_RDB_14BITLEN 1
155 #define REDIS_RDB_32BITLEN 2
156 #define REDIS_RDB_ENCVAL 3
157 #define REDIS_RDB_LENERR UINT_MAX
158
159 /* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * accordingly to the following defines: */
162 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
165 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
166
167 /* Virtual memory object->where field. */
168 #define REDIS_VM_MEMORY 0 /* The object is on memory */
169 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
170 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
173 /* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175 #define REDIS_VM_MAX_NEAR_PAGES 65536
176 #define REDIS_VM_MAX_RANDOM_JUMP 4096
177 #define REDIS_VM_MAX_THREADS 32
178 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 /* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
183 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
184
185 /* Client flags */
186 #define REDIS_SLAVE 1 /* This client is a slave server */
187 #define REDIS_MASTER 2 /* This client is a master server */
188 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189 #define REDIS_MULTI 8 /* This client is in a MULTI context */
190 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
192
193 /* Slave replication state - slave side */
194 #define REDIS_REPL_NONE 0 /* No active replication */
195 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
196 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
198 /* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
207 /* List related stuff */
208 #define REDIS_HEAD 0
209 #define REDIS_TAIL 1
210
211 /* Sort operations */
212 #define REDIS_SORT_GET 0
213 #define REDIS_SORT_ASC 1
214 #define REDIS_SORT_DESC 2
215 #define REDIS_SORTKEY_MAX 1024
216
217 /* Log levels */
218 #define REDIS_DEBUG 0
219 #define REDIS_VERBOSE 1
220 #define REDIS_NOTICE 2
221 #define REDIS_WARNING 3
222
223 /* Anti-warning macro... */
224 #define REDIS_NOTUSED(V) ((void) V)
225
226 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
228
229 /* Append only defines */
230 #define APPENDFSYNC_NO 0
231 #define APPENDFSYNC_ALWAYS 1
232 #define APPENDFSYNC_EVERYSEC 2
233
234 /* Hashes related defaults */
235 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): if _e is false, calls _redisAssert() with the stringified
 * expression, file and line, then _exit(1). Otherwise evaluates to (void)0.
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then _exit(1).
 *
 * Both expansions are fully parenthesized so that each macro behaves as a
 * single expression wherever it is used. Without the outer parentheses the
 * comma in redisPanic() would bind looser than a surrounding ?: or
 * assignment, and _exit(1) could run unconditionally. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
243
244 /*================================= Data types ============================== */
245
246 /* A redis object, that is a type able to hold a string / list / set */
247
248 /* The VM object structure: swap-file page, page count and last access time.
 * NOTE(review): the trailing `vm` declarator below also defines a file-scope
 * variable named `vm` of this type. No use of it is visible in this part of
 * the file -- confirm whether it is intentional. */
249 struct redisObjectVM {
250 off_t page; /* the page at which the object is stored on disk */
251 off_t usedpages; /* number of pages used on disk */
252 time_t atime; /* Last access time */
253 } vm;
254
255 /* The actual Redis Object */
256 typedef struct redisObject {
257 void *ptr; /* object payload; presumably interpreted according to type/encoding */
258 unsigned char type; /* one of the object types (REDIS_STRING ... REDIS_HASH) */
259 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
260 unsigned char storage; /* If this object is a key, where is the value?
261 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
262 unsigned char vtype; /* If this object is a key, and value is swapped out,
263 * this is the type of the swapped out object. */
264 int refcount; /* reference count (see incrRefCount / decrRefCount) */
265 /* VM fields, these are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate
267 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
268 * Redis without VM active will not have any overhead. */
269 struct redisObjectVM vm;
270 } robj;
271
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is an robj lvalue (not a pointer) and _ptr is stored in its 'ptr'
 * field. refcount is set to 1, type to REDIS_STRING and encoding to
 * REDIS_ENCODING_RAW. The 'storage' field is written only when
 * server.vm_enabled is true; the 'vtype' and 'vm' fields are never touched.
 *
 * The expansion intentionally does NOT end with a semicolon: the caller's
 * own ';' completes the statement, so the macro can be used as the body of
 * an unbraced if/else. Arguments are parenthesized so that expressions such
 * as *p or arr[i] can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
283
284 typedef struct redisDb { /* State of a single database */
285 dict *dict; /* The keyspace for this DB */
286 dict *expires; /* Timeout of keys with a timeout set */
287 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
288 dict *io_keys; /* Keys with clients waiting for VM I/O */
289 int id; /* presumably the numeric index of this DB -- confirm against the code that fills it */
290 } redisDb;
291
292 /* Client MULTI/EXEC state */
293 typedef struct multiCmd { /* one command stored in a multiState */
294 robj **argv; /* presumably the argument vector of the queued command -- confirm */
295 int argc; /* presumably the number of entries in argv -- confirm */
296 struct redisCommand *cmd; /* command table entry for this command */
297 } multiCmd;
298
299 typedef struct multiState {
300 multiCmd *commands; /* Array of MULTI commands */
301 int count; /* Total number of MULTI commands */
302 } multiState;
303
304 /* With multiplexing we need to keep per-client state.
305 * Clients are kept in a linked list. */
306 typedef struct redisClient {
307 int fd; /* presumably the client socket descriptor -- confirm */
308 redisDb *db; /* presumably the currently selected database -- confirm */
309 int dictid;
310 sds querybuf;
311 robj **argv, **mbargv;
312 int argc, mbargc;
313 int bulklen; /* bulk read len. -1 if not in bulk read mode */
314 int multibulk; /* multi bulk command format active */
315 list *reply;
316 int sentlen;
317 time_t lastinteraction; /* time of the last interaction, used for timeout */
318 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
319 int slaveseldb; /* slave selected db, if this client is a slave */
320 int authenticated; /* when requirepass is non-NULL */
321 int replstate; /* replication state if this is a slave */
322 int repldbfd; /* replication DB file descriptor */
323 long repldboff; /* replication DB file offset */
324 off_t repldbsize; /* replication DB file size */
325 multiState mstate; /* MULTI/EXEC state */
326 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
327 * operation such as BLPOP. Otherwise NULL. */
328 int blockingkeysnum; /* Number of blocking keys */
329 time_t blockingto; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
331 list *io_keys; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
333 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
334 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
335 } redisClient;
336
337 struct saveparam { /* element type of redisServer.saveparams */
338 time_t seconds; /* presumably the time window of a save point -- confirm against config parsing */
339 int changes; /* presumably the number of changes required within that window -- confirm */
340 };
341
342 /* Global server state structure */
343 struct redisServer {
344 int port;
345 int fd;
346 redisDb *db;
347 long long dirty; /* changes to DB from the last save */
348 list *clients;
349 list *slaves, *monitors;
350 char neterr[ANET_ERR_LEN];
351 aeEventLoop *el;
352 int cronloops; /* number of times the cron function ran */
353 list *objfreelist; /* A list of freed objects to avoid malloc() */
354 time_t lastsave; /* Unix time of the last successful save */
355 /* Fields used only for stats */
356 time_t stat_starttime; /* server start time */
357 long long stat_numcommands; /* number of processed commands */
358 long long stat_numconnections; /* number of connections received */
359 long long stat_expiredkeys; /* number of expired keys */
360 /* Configuration */
361 int verbosity;
362 int glueoutputbuf;
363 int maxidletime;
364 int dbnum;
365 int daemonize;
366 int appendonly;
367 int appendfsync;
368 time_t lastfsync;
369 int appendfd;
370 int appendseldb;
371 char *pidfile;
372 pid_t bgsavechildpid;
373 pid_t bgrewritechildpid;
374 sds bgrewritebuf; /* buffer kept by the parent during append only rewrite */
375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
380 char *appendfilename;
381 char *requirepass;
382 int shareobjects;
383 int rdbcompression;
384 int activerehashing;
385 /* Replication related */
386 int isslave;
387 char *masterauth;
388 char *masterhost;
389 int masterport;
390 redisClient *master; /* client that is master for this slave */
391 int replstate;
392 unsigned int maxclients;
393 unsigned long long maxmemory;
394 unsigned int blpop_blocked_clients;
395 unsigned int vm_blocked_clients;
396 /* Sort parameters - qsort_r() is only available under BSD so we
397 * have to keep this state global, in order to pass it to sortCompare() */
398 int sort_desc;
399 int sort_alpha;
400 int sort_bypattern;
401 /* Virtual memory configuration */
402 int vm_enabled;
403 char *vm_swap_file;
404 off_t vm_page_size;
405 off_t vm_pages;
406 unsigned long long vm_max_memory;
407 /* Hashes config */
408 size_t hash_max_zipmap_entries;
409 size_t hash_max_zipmap_value;
410 /* Virtual memory state */
411 FILE *vm_fp;
412 int vm_fd;
413 off_t vm_next_page; /* Next probably empty page */
414 off_t vm_near_pages; /* Number of pages allocated sequentially */
415 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
416 time_t unixtime; /* Unix time sampled every second. */
417 /* Virtual memory I/O threads stuff */
418 /* An I/O thread processes an element taken from the io_jobs queue and
419 * puts the result of the operation in the io_done list. While the
420 * job is being processed, it's put on io_processing queue.
 * NOTE(review): "io_jobs" and "io_done" do not match the field names below
 * (io_newjobs / io_processed) -- presumably they refer to those lists. */
421 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
422 list *io_processing; /* List of VM I/O jobs being processed */
423 list *io_processed; /* List of VM I/O jobs already processed */
424 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
425 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
426 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
427 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
428 pthread_attr_t io_threads_attr; /* attributes for threads creation */
429 int io_active_threads; /* Number of running I/O threads */
430 int vm_max_threads; /* Max number of I/O threads running at the same time */
431 /* Our main thread is blocked on the event loop, looking for sockets ready
432 * to be read or written, so when a threaded I/O operation is ready to be
433 * processed by the main thread, the I/O thread will use a unix pipe to
434 * awake the main thread. The followings are the two pipe FDs. */
435 int io_ready_pipe_read;
436 int io_ready_pipe_write;
437 /* Virtual memory stats */
438 unsigned long long vm_stats_used_pages;
439 unsigned long long vm_stats_swapped_objects;
440 unsigned long long vm_stats_swapouts;
441 unsigned long long vm_stats_swapins;
442 /* Pubsub */
443 dict *pubsub_channels; /* Map channels to list of subscribed clients */
444 list *pubsub_patterns; /* A list of pubsub_patterns */
445 /* Misc */
446 FILE *devnull;
447 };
448
449 typedef struct pubsubPattern { /* pairs a client with a pattern object */
450 redisClient *client; /* presumably the client that subscribed to the pattern -- confirm */
451 robj *pattern; /* the pattern object */
452 } pubsubPattern;
453
454 typedef void redisCommandProc(redisClient *c); /* signature of the functions stored in 'proc' and 'vm_preload_proc' */
455 struct redisCommand { /* element type of cmdTable */
456 char *name;
457 redisCommandProc *proc;
458 int arity; /* cmdTable also uses negative values (e.g. -2); presumably "at least N arguments" -- confirm */
459 int flags; /* bitwise OR of REDIS_CMD_* flags */
460 /* Use a function to determine which keys need to be loaded
461 * in the background prior to executing this command. Takes precedence
462 * over vm_firstkey and others, ignored when NULL */
463 redisCommandProc *vm_preload_proc;
464 /* What keys should be loaded in background when calling this command? */
465 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
466 int vm_lastkey; /* The last argument that's a key */
467 int vm_keystep; /* The step between first and last key */
468 };
469
470 struct redisFunctionSym { /* pairs a name with an address-sized integer */
471 char *name; /* presumably a function name -- confirm against the table that uses this type */
472 unsigned long pointer; /* presumably that function's address stored as an integer -- confirm */
473 };
474
475 typedef struct _redisSortObject { /* an object plus the value it is compared by */
476 robj *obj;
477 union { /* only one member is meaningful at a time; which one presumably depends on the sort mode -- confirm */
478 double score;
479 robj *cmpobj;
480 } u;
481 } redisSortObject;
482
483 typedef struct _redisSortOperation {
484 int type; /* presumably one of the REDIS_SORT_* values -- confirm */
485 robj *pattern;
486 } redisSortOperation;
487
488 /* ZSETs use a specialized version of Skiplists */
489
490 typedef struct zskiplistNode {
491 struct zskiplistNode **forward; /* presumably one forward pointer per level -- confirm in zslCreate/zslInsert */
492 struct zskiplistNode *backward;
493 unsigned int *span; /* presumably per-level distances used for rank computation -- confirm */
494 double score;
495 robj *obj;
496 } zskiplistNode;
497
498 typedef struct zskiplist {
499 struct zskiplistNode *header, *tail;
500 unsigned long length;
501 int level; /* presumably the highest level currently in use (bounded by ZSKIPLIST_MAXLEVEL) -- confirm */
502 } zskiplist;
503
504 typedef struct zset { /* a dict together with a skiplist */
505 dict *dict;
506 zskiplist *zsl;
507 } zset;
508
509 /* Our shared "common" objects. The struct definition below also defines the
 * file-scope variable `shared` that holds them. */
510
511 #define REDIS_SHARED_INTEGERS 10000
512 struct sharedObjectsStruct {
513 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
514 *colon, *nullbulk, *nullmultibulk, *queued,
515 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
516 *outofrangeerr, *plus,
517 *select0, *select1, *select2, *select3, *select4,
518 *select5, *select6, *select7, *select8, *select9,
519 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
520 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS]; /* integers[]: REDIS_SHARED_INTEGERS object pointers, presumably one per small integer value -- confirm */
521 } shared;
522
523 /* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
526
527 static double R_Zero, R_PosInf, R_NegInf, R_Nan;
528
529 /* VM threaded I/O request message */
530 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
533 typedef struct iojob {
534 int type; /* Request type, REDIS_IOJOB_* */
535 redisDb *db;/* Redis database */
536 robj *key; /* This I/O request is about swapping this key */
537 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543 } iojob;
544
545 /*================================ Prototypes =============================== */
546
547 static void freeStringObject(robj *o);
548 static void freeListObject(robj *o);
549 static void freeSetObject(robj *o);
550 static void decrRefCount(void *o);
551 static robj *createObject(int type, void *ptr);
552 static void freeClient(redisClient *c);
553 static int rdbLoad(char *filename);
554 static void addReply(redisClient *c, robj *obj);
555 static void addReplySds(redisClient *c, sds s);
556 static void incrRefCount(robj *o);
557 static int rdbSaveBackground(char *filename);
558 static robj *createStringObject(char *ptr, size_t len);
559 static robj *dupStringObject(robj *o);
560 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
561 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
562 static int syncWithMaster(void);
563 static robj *tryObjectEncoding(robj *o);
564 static robj *getDecodedObject(robj *o);
565 static int removeExpire(redisDb *db, robj *key);
566 static int expireIfNeeded(redisDb *db, robj *key);
567 static int deleteIfVolatile(redisDb *db, robj *key);
568 static int deleteIfSwapped(redisDb *db, robj *key);
569 static int deleteKey(redisDb *db, robj *key);
570 static time_t getExpire(redisDb *db, robj *key);
571 static int setExpire(redisDb *db, robj *key, time_t when);
572 static void updateSlavesWaitingBgsave(int bgsaveerr);
573 static void freeMemoryIfNeeded(void);
574 static int processCommand(redisClient *c);
575 static void setupSigSegvAction(void);
576 static void rdbRemoveTempFile(pid_t childpid);
577 static void aofRemoveTempFile(pid_t childpid);
578 static size_t stringObjectLen(robj *o);
579 static void processInputBuffer(redisClient *c);
580 static zskiplist *zslCreate(void);
581 static void zslFree(zskiplist *zsl);
582 static void zslInsert(zskiplist *zsl, double score, robj *obj);
583 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
584 static void initClientMultiState(redisClient *c);
585 static void freeClientMultiState(redisClient *c);
586 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
587 static void unblockClientWaitingData(redisClient *c);
588 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
589 static void vmInit(void);
590 static void vmMarkPagesFree(off_t page, off_t count);
591 static robj *vmLoadObject(robj *key);
592 static robj *vmPreviewObject(robj *key);
593 static int vmSwapOneObjectBlocking(void);
594 static int vmSwapOneObjectThreaded(void);
595 static int vmCanSwapOut(void);
596 static int tryFreeOneObjectFromFreelist(void);
597 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
598 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
599 static void vmCancelThreadedIOJob(robj *o);
600 static void lockThreadedIO(void);
601 static void unlockThreadedIO(void);
602 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
603 static void freeIOJob(iojob *j);
604 static void queueIOJob(iojob *j);
605 static int vmWriteObjectOnSwap(robj *o, off_t page);
606 static robj *vmReadObjectFromSwap(off_t page, int type);
607 static void waitEmptyIOJobsQueue(void);
608 static void vmReopenSwapFile(void);
609 static int vmFreePage(off_t page);
610 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
611 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
612 static int dontWaitForSwappedKey(redisClient *c, robj *key);
613 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
614 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
615 static struct redisCommand *lookupCommand(char *name);
616 static void call(redisClient *c, struct redisCommand *cmd);
617 static void resetClient(redisClient *c);
618 static void convertToRealHash(robj *o);
619 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
620 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
621 static void freePubsubPattern(void *p);
622 static int listMatchPubsubPattern(void *a, void *b);
623 static int compareStringObjects(robj *a, robj *b);
624 static void usage();
625
626 static void authCommand(redisClient *c);
627 static void pingCommand(redisClient *c);
628 static void echoCommand(redisClient *c);
629 static void setCommand(redisClient *c);
630 static void setnxCommand(redisClient *c);
631 static void getCommand(redisClient *c);
632 static void delCommand(redisClient *c);
633 static void existsCommand(redisClient *c);
634 static void incrCommand(redisClient *c);
635 static void decrCommand(redisClient *c);
636 static void incrbyCommand(redisClient *c);
637 static void decrbyCommand(redisClient *c);
638 static void selectCommand(redisClient *c);
639 static void randomkeyCommand(redisClient *c);
640 static void keysCommand(redisClient *c);
641 static void dbsizeCommand(redisClient *c);
642 static void lastsaveCommand(redisClient *c);
643 static void saveCommand(redisClient *c);
644 static void bgsaveCommand(redisClient *c);
645 static void bgrewriteaofCommand(redisClient *c);
646 static void shutdownCommand(redisClient *c);
647 static void moveCommand(redisClient *c);
648 static void renameCommand(redisClient *c);
649 static void renamenxCommand(redisClient *c);
650 static void lpushCommand(redisClient *c);
651 static void rpushCommand(redisClient *c);
652 static void lpopCommand(redisClient *c);
653 static void rpopCommand(redisClient *c);
654 static void llenCommand(redisClient *c);
655 static void lindexCommand(redisClient *c);
656 static void lrangeCommand(redisClient *c);
657 static void ltrimCommand(redisClient *c);
658 static void typeCommand(redisClient *c);
659 static void lsetCommand(redisClient *c);
660 static void saddCommand(redisClient *c);
661 static void sremCommand(redisClient *c);
662 static void smoveCommand(redisClient *c);
663 static void sismemberCommand(redisClient *c);
664 static void scardCommand(redisClient *c);
665 static void spopCommand(redisClient *c);
666 static void srandmemberCommand(redisClient *c);
667 static void sinterCommand(redisClient *c);
668 static void sinterstoreCommand(redisClient *c);
669 static void sunionCommand(redisClient *c);
670 static void sunionstoreCommand(redisClient *c);
671 static void sdiffCommand(redisClient *c);
672 static void sdiffstoreCommand(redisClient *c);
673 static void syncCommand(redisClient *c);
674 static void flushdbCommand(redisClient *c);
675 static void flushallCommand(redisClient *c);
676 static void sortCommand(redisClient *c);
677 static void lremCommand(redisClient *c);
678 static void rpoplpushcommand(redisClient *c);
679 static void infoCommand(redisClient *c);
680 static void mgetCommand(redisClient *c);
681 static void monitorCommand(redisClient *c);
682 static void expireCommand(redisClient *c);
683 static void expireatCommand(redisClient *c);
684 static void getsetCommand(redisClient *c);
685 static void ttlCommand(redisClient *c);
686 static void slaveofCommand(redisClient *c);
687 static void debugCommand(redisClient *c);
688 static void msetCommand(redisClient *c);
689 static void msetnxCommand(redisClient *c);
690 static void zaddCommand(redisClient *c);
691 static void zincrbyCommand(redisClient *c);
692 static void zrangeCommand(redisClient *c);
693 static void zrangebyscoreCommand(redisClient *c);
694 static void zcountCommand(redisClient *c);
695 static void zrevrangeCommand(redisClient *c);
696 static void zcardCommand(redisClient *c);
697 static void zremCommand(redisClient *c);
698 static void zscoreCommand(redisClient *c);
699 static void zremrangebyscoreCommand(redisClient *c);
700 static void multiCommand(redisClient *c);
701 static void execCommand(redisClient *c);
702 static void discardCommand(redisClient *c);
703 static void blpopCommand(redisClient *c);
704 static void brpopCommand(redisClient *c);
705 static void appendCommand(redisClient *c);
706 static void substrCommand(redisClient *c);
707 static void zrankCommand(redisClient *c);
708 static void zrevrankCommand(redisClient *c);
709 static void hsetCommand(redisClient *c);
710 static void hsetnxCommand(redisClient *c);
711 static void hgetCommand(redisClient *c);
712 static void hmsetCommand(redisClient *c);
713 static void hmgetCommand(redisClient *c);
714 static void hdelCommand(redisClient *c);
715 static void hlenCommand(redisClient *c);
716 static void zremrangebyrankCommand(redisClient *c);
717 static void zunionCommand(redisClient *c);
718 static void zinterCommand(redisClient *c);
719 static void hkeysCommand(redisClient *c);
720 static void hvalsCommand(redisClient *c);
721 static void hgetallCommand(redisClient *c);
722 static void hexistsCommand(redisClient *c);
723 static void configCommand(redisClient *c);
724 static void hincrbyCommand(redisClient *c);
725 static void subscribeCommand(redisClient *c);
726 static void unsubscribeCommand(redisClient *c);
727 static void psubscribeCommand(redisClient *c);
728 static void punsubscribeCommand(redisClient *c);
729 static void publishCommand(redisClient *c);
730
731 /*================================= Globals ================================= */
732
733 /* Global vars */
734 static struct redisServer server; /* server global state */
/* Command table: one row per command name, closed by an all-NULL/zero
 * sentinel row. Note that "smembers" is served by sinterCommand.
 *
 * NOTE(review): the column layout is defined by struct redisCommand, which
 * is not visible here. Presumably the columns are: name, handler, arity,
 * REDIS_CMD_* flags, optional callback used to block the client on swapped
 * keys, then first key index / last key index / key step -- confirm against
 * the struct definition before editing a row. */
735 static struct redisCommand cmdTable[] = {
736 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
738 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
741 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
742 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
745 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
746 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
748 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
758 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
759 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
760 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
761 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
762 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
769 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
770 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
771 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
772 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
774 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
775 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
776 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
779 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
780 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
782 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
786 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
787 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
790 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
791 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
800 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
801 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
804 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
805 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
817 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
825 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
830 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
836 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
841 {NULL,NULL,0,0,NULL,0,0,0}
842 };
843
844 /*============================ Utility functions ============================ */
845
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *       matches any sequence of bytes, including the empty one
 *   ?       matches exactly one byte
 *   [abc]   matches one byte of the set; a-c denotes a range, a leading ^
 *           negates the set and \x inside the set stands for a literal x
 *   \x      matches the byte x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers. Neither
 * buffer needs to be NUL terminated: no byte beyond the given lengths is
 * read, and an exhausted string never matches '?', a class or a literal.
 * When nocase is non-zero, literals and ranges are compared through
 * tolower(). Bytes are handled as unsigned char for range comparisons and
 * for tolower(), so the result does not depend on the signedness of char.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing '*' swallows everything */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped byte: compared literally, case sensitive. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash is matched as a literal backslash. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
968
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are taken with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
972
973 static void redisLog(int level, const char *fmt, ...) {
974 va_list ap;
975 FILE *fp;
976
977 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
978 if (!fp) return;
979
980 va_start(ap, fmt);
981 if (level >= server.verbosity) {
982 char *c = ".-*#";
983 char buf[64];
984 time_t now;
985
986 now = time(NULL);
987 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
988 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
989 vfprintf(fp, fmt, ap);
990 fprintf(fp,"\n");
991 fflush(fp);
992 }
993 va_end(ap);
994
995 if (server.logfile) fclose(fp);
996 }
997
998 /*====================== Hash table type implementation ==================== */
999
1000 /* This is a hash table type that uses the SDS dynamic strings library as
1001 * keys and redis objects as values (objects can hold SDS strings,
1002 * lists, sets). */
1003
/* Dictionary destructor callback that releases the entry with zfree().
 * privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1009
1010 static void dictListDestructor(void *privdata, void *val)
1011 {
1012 DICT_NOTUSED(privdata);
1013 listRelease((list*)val);
1014 }
1015
1016 static int sdsDictKeyCompare(void *privdata, const void *key1,
1017 const void *key2)
1018 {
1019 int l1,l2;
1020 DICT_NOTUSED(privdata);
1021
1022 l1 = sdslen((sds)key1);
1023 l2 = sdslen((sds)key2);
1024 if (l1 != l2) return 0;
1025 return memcmp(key1, key2, l1) == 0;
1026 }
1027
/* Dictionary destructor callback for redis objects: drops one reference
 * with decrRefCount(). privdata is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1035
1036 static int dictObjKeyCompare(void *privdata, const void *key1,
1037 const void *key2)
1038 {
1039 const robj *o1 = key1, *o2 = key2;
1040 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1041 }
1042
1043 static unsigned int dictObjHash(const void *key) {
1044 const robj *o = key;
1045 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1046 }
1047
1048 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1049 const void *key2)
1050 {
1051 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1052 int cmp;
1053
1054 if (o1->encoding == REDIS_ENCODING_INT &&
1055 o2->encoding == REDIS_ENCODING_INT &&
1056 o1->ptr == o2->ptr) return 1;
1057
1058 o1 = getDecodedObject(o1);
1059 o2 = getDecodedObject(o2);
1060 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1061 decrRefCount(o1);
1062 decrRefCount(o2);
1063 return cmp;
1064 }
1065
1066 static unsigned int dictEncObjHash(const void *key) {
1067 robj *o = (robj*) key;
1068
1069 if (o->encoding == REDIS_ENCODING_RAW) {
1070 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1071 } else {
1072 if (o->encoding == REDIS_ENCODING_INT) {
1073 char buf[32];
1074 int len;
1075
1076 len = snprintf(buf,32,"%ld",(long)o->ptr);
1077 return dictGenHashFunction((unsigned char*)buf, len);
1078 } else {
1079 unsigned int hash;
1080
1081 o = getDecodedObject(o);
1082 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1083 decrRefCount(o);
1084 return hash;
1085 }
1086 }
1087 }
1088
1089 /* Sets type: keys are (possibly encoded) redis objects, no values. */
/* NOTE(review): the previous comment also mentioned "expires", but
 * initServer() creates db->expires with keyptrDictType, not this type. */
1090 static dictType setDictType = {
1091 dictEncObjHash, /* hash function */
1092 NULL, /* key dup */
1093 NULL, /* val dup */
1094 dictEncObjKeyCompare, /* key compare */
1095 dictRedisObjectDestructor, /* key destructor */
1096 NULL /* val destructor */
1097 };
1098
1099 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
/* Keys are (possibly encoded) redis objects; every value is a heap block
 * holding a double, released with zfree() via dictVanillaFree. */
1100 static dictType zsetDictType = {
1101 dictEncObjHash, /* hash function */
1102 NULL, /* key dup */
1103 NULL, /* val dup */
1104 dictEncObjKeyCompare, /* key compare */
1105 dictRedisObjectDestructor, /* key destructor */
1106 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1107 };
1108
1109 /* Db->dict */
/* Main keyspace of a database: keys are hashed and compared through their
 * SDS string, and both keys and values are redis objects released with
 * dictRedisObjectDestructor. */
1110 static dictType dbDictType = {
1111 dictObjHash, /* hash function */
1112 NULL, /* key dup */
1113 NULL, /* val dup */
1114 dictObjKeyCompare, /* key compare */
1115 dictRedisObjectDestructor, /* key destructor */
1116 dictRedisObjectDestructor /* val destructor */
1117 };
1118
1119 /* Db->expires */
/* Same key handling as dbDictType, but values have no destructor:
 * serverCron() reads them back by casting the entry value to time_t. */
1120 static dictType keyptrDictType = {
1121 dictObjHash, /* hash function */
1122 NULL, /* key dup */
1123 NULL, /* val dup */
1124 dictObjKeyCompare, /* key compare */
1125 dictRedisObjectDestructor, /* key destructor */
1126 NULL /* val destructor */
1127 };
1128
1129 /* Hash type hash table (note that small hashes are represented with zipmaps) */
/* Both fields and values are (possibly encoded) redis objects released with
 * dictRedisObjectDestructor. */
1130 static dictType hashDictType = {
1131 dictEncObjHash, /* hash function */
1132 NULL, /* key dup */
1133 NULL, /* val dup */
1134 dictEncObjKeyCompare, /* key compare */
1135 dictRedisObjectDestructor, /* key destructor */
1136 dictRedisObjectDestructor /* val destructor */
1137 };
1138
1139 /* Keylist hash table type has unencoded redis objects as keys and
1140 * lists as values. It's used for blocking operations (BLPOP) and to
1141 * map swapped keys to a list of clients waiting for these keys to be loaded. */
/* The list stored as value is released with listRelease() through
 * dictListDestructor. */
1142 static dictType keylistDictType = {
1143 dictObjHash, /* hash function */
1144 NULL, /* key dup */
1145 NULL, /* val dup */
1146 dictObjKeyCompare, /* key compare */
1147 dictRedisObjectDestructor, /* key destructor */
1148 dictListDestructor /* val destructor */
1149 };
1150
/* Forward declaration of version(); its definition is not in this part of
 * the file. */
1151 static void version();
1152
1153 /* ========================= Random utility functions ======================= */
1154
1155 /* Redis generally does not try to recover from out of memory conditions
1156 * when allocating objects or strings, it is not clear if it will be possible
1157 * to report this condition to the client since the networking layer itself
1158 * is based on heap allocation for send buffers, so we simply abort.
1159 * At least the code will be simpler to read... */
1160 static void oom(const char *msg) {
1161 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1162 sleep(1);
1163 abort();
1164 }
1165
1166 /* ====================== Redis server networking stuff ===================== */
1167 static void closeTimedoutClients(void) {
1168 redisClient *c;
1169 listNode *ln;
1170 time_t now = time(NULL);
1171 listIter li;
1172
1173 listRewind(server.clients,&li);
1174 while ((ln = listNext(&li)) != NULL) {
1175 c = listNodeValue(ln);
1176 if (server.maxidletime &&
1177 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1178 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1179 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1180 listLength(c->pubsub_patterns) == 0 &&
1181 (now - c->lastinteraction > server.maxidletime))
1182 {
1183 redisLog(REDIS_VERBOSE,"Closing idle client");
1184 freeClient(c);
1185 } else if (c->flags & REDIS_BLOCKED) {
1186 if (c->blockingto != 0 && c->blockingto < now) {
1187 addReply(c,shared.nullmultibulk);
1188 unblockClientWaitingData(c);
1189 }
1190 }
1191 }
1192 }
1193
1194 static int htNeedsResize(dict *dict) {
1195 long long size, used;
1196
1197 size = dictSlots(dict);
1198 used = dictSize(dict);
1199 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1200 (used*100/size < REDIS_HT_MINFILL));
1201 }
1202
1203 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1204 * we resize the hash table to save memory */
1205 static void tryResizeHashTables(void) {
1206 int j;
1207
1208 for (j = 0; j < server.dbnum; j++) {
1209 if (htNeedsResize(server.db[j].dict))
1210 dictResize(server.db[j].dict);
1211 if (htNeedsResize(server.db[j].expires))
1212 dictResize(server.db[j].expires);
1213 }
1214 }
1215
1216 /* Our hash table implementation performs rehashing incrementally while
1217 * we write/read from the hash table. Still if the server is idle, the hash
1218 * table will use two tables for a long time. So we try to use 1 millisecond
1219 * of CPU time at every serverCron() loop in order to rehash some key. */
1220 static void incrementallyRehash(void) {
1221 int j;
1222
1223 for (j = 0; j < server.dbnum; j++) {
1224 if (dictIsRehashing(server.db[j].dict)) {
1225 dictRehashMilliseconds(server.db[j].dict,1);
1226 break; /* already used our millisecond for this loop... */
1227 }
1228 }
1229 }
1230
1231 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1232 void backgroundSaveDoneHandler(int statloc) {
1233 int exitcode = WEXITSTATUS(statloc);
1234 int bysignal = WIFSIGNALED(statloc);
1235
1236 if (!bysignal && exitcode == 0) {
1237 redisLog(REDIS_NOTICE,
1238 "Background saving terminated with success");
1239 server.dirty = 0;
1240 server.lastsave = time(NULL);
1241 } else if (!bysignal && exitcode != 0) {
1242 redisLog(REDIS_WARNING, "Background saving error");
1243 } else {
1244 redisLog(REDIS_WARNING,
1245 "Background saving terminated by signal %d", WTERMSIG(statloc));
1246 rdbRemoveTempFile(server.bgsavechildpid);
1247 }
1248 server.bgsavechildpid = -1;
1249 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1250 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1251 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1252 }
1253
1254 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1255 * Handle this. */
1256 void backgroundRewriteDoneHandler(int statloc) {
1257 int exitcode = WEXITSTATUS(statloc);
1258 int bysignal = WIFSIGNALED(statloc);
1259
1260 if (!bysignal && exitcode == 0) {
1261 int fd;
1262 char tmpfile[256];
1263
1264 redisLog(REDIS_NOTICE,
1265 "Background append only file rewriting terminated with success");
1266 /* Now it's time to flush the differences accumulated by the parent */
1267 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1268 fd = open(tmpfile,O_WRONLY|O_APPEND);
1269 if (fd == -1) {
1270 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1271 goto cleanup;
1272 }
1273 /* Flush our data... */
1274 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1275 (signed) sdslen(server.bgrewritebuf)) {
1276 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1277 close(fd);
1278 goto cleanup;
1279 }
1280 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1281 /* Now our work is to rename the temp file into the stable file. And
1282 * switch the file descriptor used by the server for append only. */
1283 if (rename(tmpfile,server.appendfilename) == -1) {
1284 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1285 close(fd);
1286 goto cleanup;
1287 }
1288 /* Mission completed... almost */
1289 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1290 if (server.appendfd != -1) {
1291 /* If append only is actually enabled... */
1292 close(server.appendfd);
1293 server.appendfd = fd;
1294 fsync(fd);
1295 server.appendseldb = -1; /* Make sure it will issue SELECT */
1296 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1297 } else {
1298 /* If append only is disabled we just generate a dump in this
1299 * format. Why not? */
1300 close(fd);
1301 }
1302 } else if (!bysignal && exitcode != 0) {
1303 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1304 } else {
1305 redisLog(REDIS_WARNING,
1306 "Background append only file rewriting terminated by signal %d",
1307 WTERMSIG(statloc));
1308 }
1309 cleanup:
1310 sdsfree(server.bgrewritebuf);
1311 server.bgrewritebuf = sdsempty();
1312 aofRemoveTempFile(server.bgrewritechildpid);
1313 server.bgrewritechildpid = -1;
1314 }
1315
1316 /* This function is called once a background process of some kind terminates,
1317 * as we want to avoid resizing the hash tables when there is a child in order
1318 * to play well with copy-on-write (otherwise when a resize happens lots of
1319 * memory pages are copied). The goal of this function is to update the ability
1320 * for dict.c to resize the hash tables accordingly to the fact we have o not
1321 * running childs. */
1322 static void updateDictResizePolicy(void) {
1323 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1324 dictEnableResize();
1325 else
1326 dictDisableResize();
1327 }
1328
1329 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1330 int j, loops = server.cronloops++;
1331 REDIS_NOTUSED(eventLoop);
1332 REDIS_NOTUSED(id);
1333 REDIS_NOTUSED(clientData);
1334
1335 /* We take a cached value of the unix time in the global state because
1336 * with virtual memory and aging there is to store the current time
1337 * in objects at every object access, and accuracy is not needed.
1338 * To access a global var is faster than calling time(NULL) */
1339 server.unixtime = time(NULL);
1340
1341 /* Show some info about non-empty databases */
1342 for (j = 0; j < server.dbnum; j++) {
1343 long long size, used, vkeys;
1344
1345 size = dictSlots(server.db[j].dict);
1346 used = dictSize(server.db[j].dict);
1347 vkeys = dictSize(server.db[j].expires);
1348 if (!(loops % 50) && (used || vkeys)) {
1349 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1350 /* dictPrintStats(server.dict); */
1351 }
1352 }
1353
1354 /* We don't want to resize the hash tables while a bacground saving
1355 * is in progress: the saving child is created using fork() that is
1356 * implemented with a copy-on-write semantic in most modern systems, so
1357 * if we resize the HT while there is the saving child at work actually
1358 * a lot of memory movements in the parent will cause a lot of pages
1359 * copied. */
1360 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1361 if (!(loops % 10)) tryResizeHashTables();
1362 if (server.activerehashing) incrementallyRehash();
1363 }
1364
1365 /* Show information about connected clients */
1366 if (!(loops % 50)) {
1367 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
1368 listLength(server.clients)-listLength(server.slaves),
1369 listLength(server.slaves),
1370 zmalloc_used_memory());
1371 }
1372
1373 /* Close connections of timedout clients */
1374 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1375 closeTimedoutClients();
1376
1377 /* Check if a background saving or AOF rewrite in progress terminated */
1378 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1379 int statloc;
1380 pid_t pid;
1381
1382 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1383 if (pid == server.bgsavechildpid) {
1384 backgroundSaveDoneHandler(statloc);
1385 } else {
1386 backgroundRewriteDoneHandler(statloc);
1387 }
1388 updateDictResizePolicy();
1389 }
1390 } else {
1391 /* If there is not a background saving in progress check if
1392 * we have to save now */
1393 time_t now = time(NULL);
1394 for (j = 0; j < server.saveparamslen; j++) {
1395 struct saveparam *sp = server.saveparams+j;
1396
1397 if (server.dirty >= sp->changes &&
1398 now-server.lastsave > sp->seconds) {
1399 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1400 sp->changes, sp->seconds);
1401 rdbSaveBackground(server.dbfilename);
1402 break;
1403 }
1404 }
1405 }
1406
1407 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1408 * will use few CPU cycles if there are few expiring keys, otherwise
1409 * it will get more aggressive to avoid that too much memory is used by
1410 * keys that can be removed from the keyspace. */
1411 for (j = 0; j < server.dbnum; j++) {
1412 int expired;
1413 redisDb *db = server.db+j;
1414
1415 /* Continue to expire if at the end of the cycle more than 25%
1416 * of the keys were expired. */
1417 do {
1418 long num = dictSize(db->expires);
1419 time_t now = time(NULL);
1420
1421 expired = 0;
1422 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1423 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1424 while (num--) {
1425 dictEntry *de;
1426 time_t t;
1427
1428 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1429 t = (time_t) dictGetEntryVal(de);
1430 if (now > t) {
1431 deleteKey(db,dictGetEntryKey(de));
1432 expired++;
1433 server.stat_expiredkeys++;
1434 }
1435 }
1436 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1437 }
1438
1439 /* Swap a few keys on disk if we are over the memory limit and VM
1440 * is enbled. Try to free objects from the free list first. */
1441 if (vmCanSwapOut()) {
1442 while (server.vm_enabled && zmalloc_used_memory() >
1443 server.vm_max_memory)
1444 {
1445 int retval;
1446
1447 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1448 retval = (server.vm_max_threads == 0) ?
1449 vmSwapOneObjectBlocking() :
1450 vmSwapOneObjectThreaded();
1451 if (retval == REDIS_ERR && !(loops % 300) &&
1452 zmalloc_used_memory() >
1453 (server.vm_max_memory+server.vm_max_memory/10))
1454 {
1455 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1456 }
1457 /* Note that when using threade I/O we free just one object,
1458 * because anyway when the I/O thread in charge to swap this
1459 * object out will finish, the handler of completed jobs
1460 * will try to swap more objects if we are still out of memory. */
1461 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1462 }
1463 }
1464
1465 /* Check if we should connect to a MASTER */
1466 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1467 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1468 if (syncWithMaster() == REDIS_OK) {
1469 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1470 }
1471 }
1472 return 100;
1473 }
1474
1475 /* This function gets called every time Redis is entering the
1476 * main loop of the event driven library, that is, before to sleep
1477 * for ready file descriptors. */
1478 static void beforeSleep(struct aeEventLoop *eventLoop) {
1479 REDIS_NOTUSED(eventLoop);
1480
1481 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1482 listIter li;
1483 listNode *ln;
1484
1485 listRewind(server.io_ready_clients,&li);
1486 while((ln = listNext(&li))) {
1487 redisClient *c = ln->value;
1488 struct redisCommand *cmd;
1489
1490 /* Resume the client. */
1491 listDelNode(server.io_ready_clients,ln);
1492 c->flags &= (~REDIS_IO_WAIT);
1493 server.vm_blocked_clients--;
1494 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1495 readQueryFromClient, c);
1496 cmd = lookupCommand(c->argv[0]->ptr);
1497 assert(cmd != NULL);
1498 call(c,cmd);
1499 resetClient(c);
1500 /* There may be more data to process in the input buffer. */
1501 if (c->querybuf && sdslen(c->querybuf) > 0)
1502 processInputBuffer(c);
1503 }
1504 }
1505 }
1506
1507 static void createSharedObjects(void) {
1508 int j;
1509
1510 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1511 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1512 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1513 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1514 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1515 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1516 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1517 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1518 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1519 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1520 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1521 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1522 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1523 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1524 "-ERR no such key\r\n"));
1525 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1526 "-ERR syntax error\r\n"));
1527 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1528 "-ERR source and destination objects are the same\r\n"));
1529 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1530 "-ERR index out of range\r\n"));
1531 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1532 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1533 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1534 shared.select0 = createStringObject("select 0\r\n",10);
1535 shared.select1 = createStringObject("select 1\r\n",10);
1536 shared.select2 = createStringObject("select 2\r\n",10);
1537 shared.select3 = createStringObject("select 3\r\n",10);
1538 shared.select4 = createStringObject("select 4\r\n",10);
1539 shared.select5 = createStringObject("select 5\r\n",10);
1540 shared.select6 = createStringObject("select 6\r\n",10);
1541 shared.select7 = createStringObject("select 7\r\n",10);
1542 shared.select8 = createStringObject("select 8\r\n",10);
1543 shared.select9 = createStringObject("select 9\r\n",10);
1544 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1545 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1546 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
1547 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1548 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
1549 shared.mbulk3 = createStringObject("*3\r\n",4);
1550 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1551 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1552 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1553 }
1554 }
1555
1556 static void appendServerSaveParams(time_t seconds, int changes) {
1557 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1558 server.saveparams[server.saveparamslen].seconds = seconds;
1559 server.saveparams[server.saveparamslen].changes = changes;
1560 server.saveparamslen++;
1561 }
1562
1563 static void resetServerSaveParams() {
1564 zfree(server.saveparams);
1565 server.saveparams = NULL;
1566 server.saveparamslen = 0;
1567 }
1568
1569 static void initServerConfig() {
1570 server.dbnum = REDIS_DEFAULT_DBNUM;
1571 server.port = REDIS_SERVERPORT;
1572 server.verbosity = REDIS_VERBOSE;
1573 server.maxidletime = REDIS_MAXIDLETIME;
1574 server.saveparams = NULL;
1575 server.logfile = NULL; /* NULL = log on standard output */
1576 server.bindaddr = NULL;
1577 server.glueoutputbuf = 1;
1578 server.daemonize = 0;
1579 server.appendonly = 0;
1580 server.appendfsync = APPENDFSYNC_ALWAYS;
1581 server.lastfsync = time(NULL);
1582 server.appendfd = -1;
1583 server.appendseldb = -1; /* Make sure the first time will not match */
1584 server.pidfile = zstrdup("/var/run/redis.pid");
1585 server.dbfilename = zstrdup("dump.rdb");
1586 server.appendfilename = zstrdup("appendonly.aof");
1587 server.requirepass = NULL;
1588 server.shareobjects = 0;
1589 server.rdbcompression = 1;
1590 server.activerehashing = 1;
1591 server.maxclients = 0;
1592 server.blpop_blocked_clients = 0;
1593 server.maxmemory = 0;
1594 server.vm_enabled = 0;
1595 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1596 server.vm_page_size = 256; /* 256 bytes per page */
1597 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1598 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1599 server.vm_max_threads = 4;
1600 server.vm_blocked_clients = 0;
1601 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1602 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1603
1604 resetServerSaveParams();
1605
1606 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1607 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1608 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1609 /* Replication related */
1610 server.isslave = 0;
1611 server.masterauth = NULL;
1612 server.masterhost = NULL;
1613 server.masterport = 6379;
1614 server.master = NULL;
1615 server.replstate = REDIS_REPL_NONE;
1616
1617 /* Double constants initialization */
1618 R_Zero = 0.0;
1619 R_PosInf = 1.0/R_Zero;
1620 R_NegInf = -1.0/R_Zero;
1621 R_Nan = R_Zero/R_Zero;
1622 }
1623
1624 static void initServer() {
1625 int j;
1626
1627 signal(SIGHUP, SIG_IGN);
1628 signal(SIGPIPE, SIG_IGN);
1629 setupSigSegvAction();
1630
1631 server.devnull = fopen("/dev/null","w");
1632 if (server.devnull == NULL) {
1633 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1634 exit(1);
1635 }
1636 server.clients = listCreate();
1637 server.slaves = listCreate();
1638 server.monitors = listCreate();
1639 server.objfreelist = listCreate();
1640 createSharedObjects();
1641 server.el = aeCreateEventLoop();
1642 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1643 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1644 if (server.fd == -1) {
1645 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1646 exit(1);
1647 }
1648 for (j = 0; j < server.dbnum; j++) {
1649 server.db[j].dict = dictCreate(&dbDictType,NULL);
1650 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1651 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1652 if (server.vm_enabled)
1653 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1654 server.db[j].id = j;
1655 }
1656 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1657 server.pubsub_patterns = listCreate();
1658 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1659 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
1660 server.cronloops = 0;
1661 server.bgsavechildpid = -1;
1662 server.bgrewritechildpid = -1;
1663 server.bgrewritebuf = sdsempty();
1664 server.lastsave = time(NULL);
1665 server.dirty = 0;
1666 server.stat_numcommands = 0;
1667 server.stat_numconnections = 0;
1668 server.stat_expiredkeys = 0;
1669 server.stat_starttime = time(NULL);
1670 server.unixtime = time(NULL);
1671 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1672 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1673 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1674
1675 if (server.appendonly) {
1676 int flags = O_WRONLY|O_APPEND|O_CREAT;
1677
1678 #ifdef HAVE_O_DIRECT
1679 if (server.appendfsync == APPENDFSYNC_ALWAYS) {
1680 flags |= O_DIRECT;
1681 server.appendfsync = APPENDFSYNC_NO;
1682 }
1683 #endif
1684
1685 server.appendfd = open(server.appendfilename,flags,0644);
1686 if (server.appendfd == -1) {
1687 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1688 strerror(errno));
1689 exit(1);
1690 }
1691 }
1692
1693 if (server.vm_enabled) vmInit();
1694 }
1695
1696 /* Empty the whole database */
1697 static long long emptyDb() {
1698 int j;
1699 long long removed = 0;
1700
1701 for (j = 0; j < server.dbnum; j++) {
1702 removed += dictSize(server.db[j].dict);
1703 dictEmpty(server.db[j].dict);
1704 dictEmpty(server.db[j].expires);
1705 }
1706 return removed;
1707 }
1708
/* Translate a "yes" / "no" string (compared ignoring case) into 1 / 0.
 * Every other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1714
1715 /* I agree, this is a very rudimental way to load a configuration...
1716 will improve later if the config gets more complex */
1717 static void loadServerConfig(char *filename) {
1718 FILE *fp;
1719 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1720 int linenum = 0;
1721 sds line = NULL;
1722
1723 if (filename[0] == '-' && filename[1] == '\0')
1724 fp = stdin;
1725 else {
1726 if ((fp = fopen(filename,"r")) == NULL) {
1727 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1728 exit(1);
1729 }
1730 }
1731
1732 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1733 sds *argv;
1734 int argc, j;
1735
1736 linenum++;
1737 line = sdsnew(buf);
1738 line = sdstrim(line," \t\r\n");
1739
1740 /* Skip comments and blank lines*/
1741 if (line[0] == '#' || line[0] == '\0') {
1742 sdsfree(line);
1743 continue;
1744 }
1745
1746 /* Split into arguments */
1747 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1748 sdstolower(argv[0]);
1749
1750 /* Execute config directives */
1751 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1752 server.maxidletime = atoi(argv[1]);
1753 if (server.maxidletime < 0) {
1754 err = "Invalid timeout value"; goto loaderr;
1755 }
1756 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1757 server.port = atoi(argv[1]);
1758 if (server.port < 1 || server.port > 65535) {
1759 err = "Invalid port"; goto loaderr;
1760 }
1761 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1762 server.bindaddr = zstrdup(argv[1]);
1763 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1764 int seconds = atoi(argv[1]);
1765 int changes = atoi(argv[2]);
1766 if (seconds < 1 || changes < 0) {
1767 err = "Invalid save parameters"; goto loaderr;
1768 }
1769 appendServerSaveParams(seconds,changes);
1770 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1771 if (chdir(argv[1]) == -1) {
1772 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1773 argv[1], strerror(errno));
1774 exit(1);
1775 }
1776 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1777 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1778 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1779 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1780 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1781 else {
1782 err = "Invalid log level. Must be one of debug, notice, warning";
1783 goto loaderr;
1784 }
1785 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1786 FILE *logfp;
1787
1788 server.logfile = zstrdup(argv[1]);
1789 if (!strcasecmp(server.logfile,"stdout")) {
1790 zfree(server.logfile);
1791 server.logfile = NULL;
1792 }
1793 if (server.logfile) {
1794 /* Test if we are able to open the file. The server will not
1795 * be able to abort just for this problem later... */
1796 logfp = fopen(server.logfile,"a");
1797 if (logfp == NULL) {
1798 err = sdscatprintf(sdsempty(),
1799 "Can't open the log file: %s", strerror(errno));
1800 goto loaderr;
1801 }
1802 fclose(logfp);
1803 }
1804 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1805 server.dbnum = atoi(argv[1]);
1806 if (server.dbnum < 1) {
1807 err = "Invalid number of databases"; goto loaderr;
1808 }
1809 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1810 loadServerConfig(argv[1]);
1811 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1812 server.maxclients = atoi(argv[1]);
1813 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1814 server.maxmemory = strtoll(argv[1], NULL, 10);
1815 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1816 server.masterhost = sdsnew(argv[1]);
1817 server.masterport = atoi(argv[2]);
1818 server.replstate = REDIS_REPL_CONNECT;
1819 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1820 server.masterauth = zstrdup(argv[1]);
1821 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1822 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1823 err = "argument must be 'yes' or 'no'"; goto loaderr;
1824 }
1825 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1826 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1827 err = "argument must be 'yes' or 'no'"; goto loaderr;
1828 }
1829 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1830 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1831 err = "argument must be 'yes' or 'no'"; goto loaderr;
1832 }
1833 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1834 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
1835 err = "argument must be 'yes' or 'no'"; goto loaderr;
1836 }
1837 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1838 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1839 err = "argument must be 'yes' or 'no'"; goto loaderr;
1840 }
1841 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1842 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1843 err = "argument must be 'yes' or 'no'"; goto loaderr;
1844 }
1845 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1846 if (!strcasecmp(argv[1],"no")) {
1847 server.appendfsync = APPENDFSYNC_NO;
1848 } else if (!strcasecmp(argv[1],"always")) {
1849 server.appendfsync = APPENDFSYNC_ALWAYS;
1850 } else if (!strcasecmp(argv[1],"everysec")) {
1851 server.appendfsync = APPENDFSYNC_EVERYSEC;
1852 } else {
1853 err = "argument must be 'no', 'always' or 'everysec'";
1854 goto loaderr;
1855 }
1856 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1857 server.requirepass = zstrdup(argv[1]);
1858 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1859 zfree(server.pidfile);
1860 server.pidfile = zstrdup(argv[1]);
1861 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1862 zfree(server.dbfilename);
1863 server.dbfilename = zstrdup(argv[1]);
1864 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1865 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1866 err = "argument must be 'yes' or 'no'"; goto loaderr;
1867 }
1868 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1869 zfree(server.vm_swap_file);
1870 server.vm_swap_file = zstrdup(argv[1]);
1871 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1872 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1873 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1874 server.vm_page_size = strtoll(argv[1], NULL, 10);
1875 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1876 server.vm_pages = strtoll(argv[1], NULL, 10);
1877 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1878 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1879 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1880 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1881 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1882 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1883 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1884 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1885 } else {
1886 err = "Bad directive or wrong number of arguments"; goto loaderr;
1887 }
1888 for (j = 0; j < argc; j++)
1889 sdsfree(argv[j]);
1890 zfree(argv);
1891 sdsfree(line);
1892 }
1893 if (fp != stdin) fclose(fp);
1894 return;
1895
1896 loaderr:
1897 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1898 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1899 fprintf(stderr, ">>> '%s'\n", line);
1900 fprintf(stderr, "%s\n", err);
1901 exit(1);
1902 }
1903
1904 static void freeClientArgv(redisClient *c) {
1905 int j;
1906
1907 for (j = 0; j < c->argc; j++)
1908 decrRefCount(c->argv[j]);
1909 for (j = 0; j < c->mbargc; j++)
1910 decrRefCount(c->mbargv[j]);
1911 c->argc = 0;
1912 c->mbargc = 0;
1913 }
1914
/* Release every resource owned by the client 'c': query buffer, Pub/Sub
 * subscriptions, file events, reply list, arguments, socket, its entries
 * in the server-wide lists (clients, io_ready_clients, slaves/monitors),
 * the MULTI state and finally the client structure itself.
 * The order of the steps matters, see the comments in the body. */
1915 static void freeClient(redisClient *c) {
1916 listNode *ln;
1917
1918 /* Note that if the client we are freeing is blocked into a blocking
1919 * call, we have to set querybuf to NULL *before* calling
1920 * unblockClientWaitingData() so that processInputBuffer() does not get
1921 * called. Also it is important to remove the file events after
1922 * this, because this call adds the READABLE event. */
1923 sdsfree(c->querybuf);
1924 c->querybuf = NULL;
1925 if (c->flags & REDIS_BLOCKED)
1926 unblockClientWaitingData(c);
1927
1928 /* Unsubscribe from all the pubsub channels */
1929 pubsubUnsubscribeAllChannels(c,0);
1930 pubsubUnsubscribeAllPatterns(c,0);
1931 dictRelease(c->pubsub_channels);
1932 listRelease(c->pubsub_patterns);
1933 /* Obvious cleanup */
1934 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1935 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1936 listRelease(c->reply);
1937 freeClientArgv(c);
1938 close(c->fd);
1939 /* Remove from the list of clients. Every client is expected to be
 * in server.clients, hence the assertion. */
1940 ln = listSearchKey(server.clients,c);
1941 redisAssert(ln != NULL);
1942 listDelNode(server.clients,ln);
1943 /* Remove from the list of clients waiting for swapped keys: a client
 * flagged REDIS_IO_WAIT with no pending io_keys may be queued in
 * server.io_ready_clients; if found there it is unlinked and the
 * counter of VM blocked clients is decremented. */
1944 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1945 ln = listSearchKey(server.io_ready_clients,c);
1946 if (ln) {
1947 listDelNode(server.io_ready_clients,ln);
1948 server.vm_blocked_clients--;
1949 }
1950 }
/* With VM enabled, stop waiting for every key still listed in io_keys.
 * NOTE(review): the loop relies on dontWaitForSwappedKey() removing the
 * key from c->io_keys, otherwise it would never end - confirm. */
1951 while (server.vm_enabled && listLength(c->io_keys)) {
1952 ln = listFirst(c->io_keys);
1953 dontWaitForSwappedKey(c,ln->value);
1954 }
1955 listRelease(c->io_keys);
1956 /* Master/slave cleanup: close the dump file descriptor if a bulk
 * transfer was in progress, then unlink the client from the monitors
 * list (REDIS_MONITOR flag set) or from the slaves list. */
1957 if (c->flags & REDIS_SLAVE) {
1958 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1959 close(c->repldbfd);
1960 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1961 ln = listSearchKey(l,c);
1962 redisAssert(ln != NULL);
1963 listDelNode(l,ln);
1964 }
/* The client was our master: forget it and set the replication state
 * back to REDIS_REPL_CONNECT. */
1965 if (c->flags & REDIS_MASTER) {
1966 server.master = NULL;
1967 server.replstate = REDIS_REPL_CONNECT;
1968 }
1969 /* Release memory */
1970 zfree(c->argv);
1971 zfree(c->mbargv);
1972 freeClientMultiState(c);
1973 zfree(c);
1974 }
1975
1976 #define GLUEREPLY_UP_TO (1024)
1977 static void glueReplyBuffersIfNeeded(redisClient *c) {
1978 int copylen = 0;
1979 char buf[GLUEREPLY_UP_TO];
1980 listNode *ln;
1981 listIter li;
1982 robj *o;
1983
1984 listRewind(c->reply,&li);
1985 while((ln = listNext(&li))) {
1986 int objlen;
1987
1988 o = ln->value;
1989 objlen = sdslen(o->ptr);
1990 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1991 memcpy(buf+copylen,o->ptr,objlen);
1992 copylen += objlen;
1993 listDelNode(c->reply,ln);
1994 } else {
1995 if (copylen == 0) return;
1996 break;
1997 }
1998 }
1999 /* Now the output buffer is empty, add the new single element */
2000 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2001 listAddNodeHead(c->reply,o);
2002 }
2003
2004 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2005 redisClient *c = privdata;
2006 int nwritten = 0, totwritten = 0, objlen;
2007 robj *o;
2008 REDIS_NOTUSED(el);
2009 REDIS_NOTUSED(mask);
2010
2011 /* Use writev() if we have enough buffers to send */
2012 if (!server.glueoutputbuf &&
2013 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
2014 !(c->flags & REDIS_MASTER))
2015 {
2016 sendReplyToClientWritev(el, fd, privdata, mask);
2017 return;
2018 }
2019
2020 while(listLength(c->reply)) {
2021 if (server.glueoutputbuf && listLength(c->reply) > 1)
2022 glueReplyBuffersIfNeeded(c);
2023
2024 o = listNodeValue(listFirst(c->reply));
2025 objlen = sdslen(o->ptr);
2026
2027 if (objlen == 0) {
2028 listDelNode(c->reply,listFirst(c->reply));
2029 continue;
2030 }
2031
2032 if (c->flags & REDIS_MASTER) {
2033 /* Don't reply to a master */
2034 nwritten = objlen - c->sentlen;
2035 } else {
2036 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
2037 if (nwritten <= 0) break;
2038 }
2039 c->sentlen += nwritten;
2040 totwritten += nwritten;
2041 /* If we fully sent the object on head go to the next one */
2042 if (c->sentlen == objlen) {
2043 listDelNode(c->reply,listFirst(c->reply));
2044 c->sentlen = 0;
2045 }
2046 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
2047 * bytes, in a single threaded server it's a good idea to serve
2048 * other clients as well, even if a very large request comes from
2049 * super fast link that is always able to accept data (in real world
2050 * scenario think about 'KEYS *' against the loopback interfae) */
2051 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
2052 }
2053 if (nwritten == -1) {
2054 if (errno == EAGAIN) {
2055 nwritten = 0;
2056 } else {
2057 redisLog(REDIS_VERBOSE,
2058 "Error writing to client: %s", strerror(errno));
2059 freeClient(c);
2060 return;
2061 }
2062 }
2063 if (totwritten > 0) c->lastinteraction = time(NULL);
2064 if (listLength(c->reply) == 0) {
2065 c->sentlen = 0;
2066 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2067 }
2068 }
2069
2070 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2071 {
2072 redisClient *c = privdata;
2073 int nwritten = 0, totwritten = 0, objlen, willwrite;
2074 robj *o;
2075 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2076 int offset, ion = 0;
2077 REDIS_NOTUSED(el);
2078 REDIS_NOTUSED(mask);
2079
2080 listNode *node;
2081 while (listLength(c->reply)) {
2082 offset = c->sentlen;
2083 ion = 0;
2084 willwrite = 0;
2085
2086 /* fill-in the iov[] array */
2087 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2088 o = listNodeValue(node);
2089 objlen = sdslen(o->ptr);
2090
2091 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2092 break;
2093
2094 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2095 break; /* no more iovecs */
2096
2097 iov[ion].iov_base = ((char*)o->ptr) + offset;
2098 iov[ion].iov_len = objlen - offset;
2099 willwrite += objlen - offset;
2100 offset = 0; /* just for the first item */
2101 ion++;
2102 }
2103
2104 if(willwrite == 0)
2105 break;
2106
2107 /* write all collected blocks at once */
2108 if((nwritten = writev(fd, iov, ion)) < 0) {
2109 if (errno != EAGAIN) {
2110 redisLog(REDIS_VERBOSE,
2111 "Error writing to client: %s", strerror(errno));
2112 freeClient(c);
2113 return;
2114 }
2115 break;
2116 }
2117
2118 totwritten += nwritten;
2119 offset = c->sentlen;
2120
2121 /* remove written robjs from c->reply */
2122 while (nwritten && listLength(c->reply)) {
2123 o = listNodeValue(listFirst(c->reply));
2124 objlen = sdslen(o->ptr);
2125
2126 if(nwritten >= objlen - offset) {
2127 listDelNode(c->reply, listFirst(c->reply));
2128 nwritten -= objlen - offset;
2129 c->sentlen = 0;
2130 } else {
2131 /* partial write */
2132 c->sentlen += nwritten;
2133 break;
2134 }
2135 offset = 0;
2136 }
2137 }
2138
2139 if (totwritten > 0)
2140 c->lastinteraction = time(NULL);
2141
2142 if (listLength(c->reply) == 0) {
2143 c->sentlen = 0;
2144 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2145 }
2146 }
2147
2148 static struct redisCommand *lookupCommand(char *name) {
2149 int j = 0;
2150 while(cmdTable[j].name != NULL) {
2151 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2152 j++;
2153 }
2154 return NULL;
2155 }
2156
2157 /* resetClient prepare the client to process the next command */
2158 static void resetClient(redisClient *c) {
2159 freeClientArgv(c);
2160 c->bulklen = -1;
2161 c->multibulk = 0;
2162 }
2163
2164 /* Call() is the core of Redis execution of a command */
2165 static void call(redisClient *c, struct redisCommand *cmd) {
2166 long long dirty;
2167
2168 dirty = server.dirty;
2169 cmd->proc(c);
2170 dirty = server.dirty-dirty;
2171
2172 if (server.appendonly && dirty)
2173 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2174 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2175 listLength(server.slaves))
2176 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2177 if (listLength(server.monitors))
2178 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2179 server.stat_numcommands++;
2180 }
2181
2182 /* If this function gets called we already read a whole
2183 * command, arguments are in the client argv/argc fields.
2184 * processCommand() executes the command or prepares the
2185 * server for a bulk read from the client.
2186 *
2187 * If 1 is returned the client is still alive and valid
2188 * and other operations can be performed by the caller. Otherwise
2189 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2190 static int processCommand(redisClient *c) {
2191 struct redisCommand *cmd;
2192
2193 /* Free some memory if needed (maxmemory setting) */
2194 if (server.maxmemory) freeMemoryIfNeeded();
2195
2196 /* Handle the multi bulk command type. This is an alternative protocol
2197 * supported by Redis in order to receive commands that are composed of
2198 * multiple binary-safe "bulk" arguments. The latency of processing is
2199 * a bit higher but this allows things like multi-sets, so if this
2200 * protocol is used only for MSET and similar commands this is a big win. */
2201 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
/* "*<count>": start of a multi bulk request. A count <= 0 is
 * ignored and the client is reset. */
2202 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2203 if (c->multibulk <= 0) {
2204 resetClient(c);
2205 return 1;
2206 } else {
2207 decrRefCount(c->argv[c->argc-1]);
2208 c->argc--;
2209 return 1;
2210 }
2211 } else if (c->multibulk) {
2212 if (c->bulklen == -1) {
/* Inside a multi bulk request and no bulk read pending: the line
 * just read must be the "$<len>" header of the next argument. */
2213 if (((char*)c->argv[0]->ptr)[0] != '$') {
2214 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2215 resetClient(c);
2216 return 1;
2217 } else {
2218 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2219 decrRefCount(c->argv[0]);
2220 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2221 c->argc--;
2222 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2223 resetClient(c);
2224 return 1;
2225 }
2226 c->argc--;
2227 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2228 return 1;
2229 }
2230 } else {
/* A bulk argument was fully read into argv[0]: move it to the
 * multi bulk argument vector and count it. */
2231 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2232 c->mbargv[c->mbargc] = c->argv[0];
2233 c->mbargc++;
2234 c->argc--;
2235 c->multibulk--;
2236 if (c->multibulk == 0) {
2237 robj **auxargv;
2238 int auxargc;
2239
2240 /* Here we need to swap the multi-bulk argc/argv with the
2241 * normal argc/argv of the client structure. */
2242 auxargv = c->argv;
2243 c->argv = c->mbargv;
2244 c->mbargv = auxargv;
2245
2246 auxargc = c->argc;
2247 c->argc = c->mbargc;
2248 c->mbargc = auxargc;
2249
2250 /* We need to set bulklen to something different than -1
2251 * in order for the code below to process the command without
2252 * trying to read the last argument of a bulk command as
2253 * a special argument. */
2254 c->bulklen = 0;
2255 /* continue below and process the command */
2256 } else {
2257 c->bulklen = -1;
2258 return 1;
2259 }
2260 }
2261 }
2262 /* -- end of multi bulk commands processing -- */
2263
2264 /* The QUIT command is handled as a special case. Normal command
2265 * procs are unable to close the client connection safely */
2266 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2267 freeClient(c);
2268 return 0;
2269 }
2270
2271 /* Now lookup the command and check ASAP about trivial error conditions
2272 * such as wrong arity, bad command name and so forth. */
2273 cmd = lookupCommand(c->argv[0]->ptr);
2274 if (!cmd) {
2275 addReplySds(c,
2276 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2277 (char*)c->argv[0]->ptr));
2278 resetClient(c);
2279 return 1;
/* A positive arity is an exact argument count, a negative arity -N
 * means "at least N arguments". */
2280 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2281 (c->argc < -cmd->arity)) {
2282 addReplySds(c,
2283 sdscatprintf(sdsempty(),
2284 "-ERR wrong number of arguments for '%s' command\r\n",
2285 cmd->name));
2286 resetClient(c);
2287 return 1;
2288 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2289 /* This is a bulk command, we have to read the last argument yet.
 * The last token of the line is the byte count of that argument. */
2290 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2291
2292 decrRefCount(c->argv[c->argc-1]);
2293 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2294 c->argc--;
2295 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2296 resetClient(c);
2297 return 1;
2298 }
2299 c->argc--;
2300 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2301 /* It is possible that the bulk read is already in the
2302 * buffer. Check this condition and handle it accordingly.
2303 * This is just a fast path, alternative to call processInputBuffer().
2304 * It's a good idea since the code is small and this condition
2305 * happens most of the times. */
2306 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2307 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2308 c->argc++;
2309 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2310 } else {
2311 /* Otherwise return... the last argument still has to be read
2312 * from the socket. */
2313 return 1;
2314 }
2315 }
2316 /* Let's try to encode the bulk object to save space. */
2317 if (cmd->flags & REDIS_CMD_BULK)
2318 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
2319
2320 /* Check if the user is authenticated */
2321 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2322 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2323 resetClient(c);
2324 return 1;
2325 }
2326
2327 /* Handle the maxmemory directive */
2328 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2329 zmalloc_used_memory() > server.maxmemory)
2330 {
2331 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2332 resetClient(c);
2333 return 1;
2334 }
2335
2336 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
2337 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2338 &&
2339 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2340 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2341 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
2342 resetClient(c);
2343 return 1;
2344 }
2345
2346 /* Exec the command: inside a MULTI context everything but EXEC and
 * DISCARD is queued instead of being executed. */
2347 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2348 queueMultiCommand(c,cmd);
2349 addReply(c,shared.queued);
2350 } else {
/* With threaded VM, return without executing or resetting the
 * client when blockClientOnSwappedKeys() reports a non zero value. */
2351 if (server.vm_enabled && server.vm_max_threads > 0 &&
2352 blockClientOnSwappedKeys(cmd,c)) return 1;
2353 call(c,cmd);
2354 }
2355
2356 /* Prepare the client for the next command */
2357 resetClient(c);
2358 return 1;
2359 }
2360
2361 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2362 listNode *ln;
2363 listIter li;
2364 int outc = 0, j;
2365 robj **outv;
2366 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2367 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2368 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2369 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2370 robj *lenobj;
2371
2372 if (argc <= REDIS_STATIC_ARGS) {
2373 outv = static_outv;
2374 } else {
2375 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2376 }
2377
2378 lenobj = createObject(REDIS_STRING,
2379 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2380 lenobj->refcount = 0;
2381 outv[outc++] = lenobj;
2382 for (j = 0; j < argc; j++) {
2383 lenobj = createObject(REDIS_STRING,
2384 sdscatprintf(sdsempty(),"$%lu\r\n",
2385 (unsigned long) stringObjectLen(argv[j])));
2386 lenobj->refcount = 0;
2387 outv[outc++] = lenobj;
2388 outv[outc++] = argv[j];
2389 outv[outc++] = shared.crlf;
2390 }
2391
2392 /* Increment all the refcounts at start and decrement at end in order to
2393 * be sure to free objects if there is no slave in a replication state
2394 * able to be feed with commands */
2395 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2396 listRewind(slaves,&li);
2397 while((ln = listNext(&li))) {
2398 redisClient *slave = ln->value;
2399
2400 /* Don't feed slaves that are still waiting for BGSAVE to start */
2401 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2402
2403 /* Feed all the other slaves, MONITORs and so on */
2404 if (slave->slaveseldb != dictid) {
2405 robj *selectcmd;
2406
2407 switch(dictid) {
2408 case 0: selectcmd = shared.select0; break;
2409 case 1: selectcmd = shared.select1; break;
2410 case 2: selectcmd = shared.select2; break;
2411 case 3: selectcmd = shared.select3; break;
2412 case 4: selectcmd = shared.select4; break;
2413 case 5: selectcmd = shared.select5; break;
2414 case 6: selectcmd = shared.select6; break;
2415 case 7: selectcmd = shared.select7; break;
2416 case 8: selectcmd = shared.select8; break;
2417 case 9: selectcmd = shared.select9; break;
2418 default:
2419 selectcmd = createObject(REDIS_STRING,
2420 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2421 selectcmd->refcount = 0;
2422 break;
2423 }
2424 addReply(slave,selectcmd);
2425 slave->slaveseldb = dictid;
2426 }
2427 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2428 }
2429 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2430 if (outv != static_outv) zfree(outv);
2431 }
2432
2433 static void processInputBuffer(redisClient *c) {
2434 again:
2435 /* Before to process the input buffer, make sure the client is not
2436 * waitig for a blocking operation such as BLPOP. Note that the first
2437 * iteration the client is never blocked, otherwise the processInputBuffer
2438 * would not be called at all, but after the execution of the first commands
2439 * in the input buffer the client may be blocked, and the "goto again"
2440 * will try to reiterate. The following line will make it return asap. */
2441 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2442 if (c->bulklen == -1) {
2443 /* Read the first line of the query */
2444 char *p = strchr(c->querybuf,'\n');
2445 size_t querylen;
2446
2447 if (p) {
2448 sds query, *argv;
2449 int argc, j;
2450
2451 query = c->querybuf;
2452 c->querybuf = sdsempty();
2453 querylen = 1+(p-(query));
2454 if (sdslen(query) > querylen) {
2455 /* leave data after the first line of the query in the buffer */
2456 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2457 }
2458 *p = '\0'; /* remove "\n" */
2459 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2460 sdsupdatelen(query);
2461
2462 /* Now we can split the query in arguments */
2463 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2464 sdsfree(query);
2465
2466 if (c->argv) zfree(c->argv);
2467 c->argv = zmalloc(sizeof(robj*)*argc);
2468
2469 for (j = 0; j < argc; j++) {
2470 if (sdslen(argv[j])) {
2471 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2472 c->argc++;
2473 } else {
2474 sdsfree(argv[j]);
2475 }
2476 }
2477 zfree(argv);
2478 if (c->argc) {
2479 /* Execute the command. If the client is still valid
2480 * after processCommand() return and there is something
2481 * on the query buffer try to process the next command. */
2482 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2483 } else {
2484 /* Nothing to process, argc == 0. Just process the query
2485 * buffer if it's not empty or return to the caller */
2486 if (sdslen(c->querybuf)) goto again;
2487 }
2488 return;
2489 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2490 redisLog(REDIS_VERBOSE, "Client protocol error");
2491 freeClient(c);
2492 return;
2493 }
2494 } else {
2495 /* Bulk read handling. Note that if we are at this point
2496 the client already sent a command terminated with a newline,
2497 we are reading the bulk data that is actually the last
2498 argument of the command. */
2499 int qbl = sdslen(c->querybuf);
2500
2501 if (c->bulklen <= qbl) {
2502 /* Copy everything but the final CRLF as final argument */
2503 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2504 c->argc++;
2505 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2506 /* Process the command. If the client is still valid after
2507 * the processing and there is more data in the buffer
2508 * try to parse it. */
2509 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2510 return;
2511 }
2512 }
2513 }
2514
2515 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2516 redisClient *c = (redisClient*) privdata;
2517 char buf[REDIS_IOBUF_LEN];
2518 int nread;
2519 REDIS_NOTUSED(el);
2520 REDIS_NOTUSED(mask);
2521
2522 nread = read(fd, buf, REDIS_IOBUF_LEN);
2523 if (nread == -1) {
2524 if (errno == EAGAIN) {
2525 nread = 0;
2526 } else {
2527 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2528 freeClient(c);
2529 return;
2530 }
2531 } else if (nread == 0) {
2532 redisLog(REDIS_VERBOSE, "Client closed connection");
2533 freeClient(c);
2534 return;
2535 }
2536 if (nread) {
2537 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2538 c->lastinteraction = time(NULL);
2539 } else {
2540 return;
2541 }
2542 processInputBuffer(c);
2543 }
2544
2545 static int selectDb(redisClient *c, int id) {
2546 if (id < 0 || id >= server.dbnum)
2547 return REDIS_ERR;
2548 c->db = &server.db[id];
2549 return REDIS_OK;
2550 }
2551
2552 static void *dupClientReplyValue(void *o) {
2553 incrRefCount((robj*)o);
2554 return o;
2555 }
2556
/* List "match" callback: returns 1 when the two string objects compare
 * equal according to compareStringObjects(), 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    return !compareStringObjects(a,b);
}
2560
2561 static redisClient *createClient(int fd) {
2562 redisClient *c = zmalloc(sizeof(*c));
2563
2564 anetNonBlock(NULL,fd);
2565 anetTcpNoDelay(NULL,fd);
2566 if (!c) return NULL;
2567 selectDb(c,0);
2568 c->fd = fd;
2569 c->querybuf = sdsempty();
2570 c->argc = 0;
2571 c->argv = NULL;
2572 c->bulklen = -1;
2573 c->multibulk = 0;
2574 c->mbargc = 0;
2575 c->mbargv = NULL;
2576 c->sentlen = 0;
2577 c->flags = 0;
2578 c->lastinteraction = time(NULL);
2579 c->authenticated = 0;
2580 c->replstate = REDIS_REPL_NONE;
2581 c->reply = listCreate();
2582 listSetFreeMethod(c->reply,decrRefCount);
2583 listSetDupMethod(c->reply,dupClientReplyValue);
2584 c->blockingkeys = NULL;
2585 c->blockingkeysnum = 0;
2586 c->io_keys = listCreate();
2587 listSetFreeMethod(c->io_keys,decrRefCount);
2588 c->pubsub_channels = dictCreate(&setDictType,NULL);
2589 c->pubsub_patterns = listCreate();
2590 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2591 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
2592 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2593 readQueryFromClient, c) == AE_ERR) {
2594 freeClient(c);
2595 return NULL;
2596 }
2597 listAddNodeTail(server.clients,c);
2598 initClientMultiState(c);
2599 return c;
2600 }
2601
2602 static void addReply(redisClient *c, robj *obj) {
2603 if (listLength(c->reply) == 0 &&
2604 (c->replstate == REDIS_REPL_NONE ||
2605 c->replstate == REDIS_REPL_ONLINE) &&
2606 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2607 sendReplyToClient, c) == AE_ERR) return;
2608
2609 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2610 obj = dupStringObject(obj);
2611 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2612 }
2613 listAddNodeTail(c->reply,getDecodedObject(obj));
2614 }
2615
2616 static void addReplySds(redisClient *c, sds s) {
2617 robj *o = createObject(REDIS_STRING,s);
2618 addReply(c,o);
2619 decrRefCount(o);
2620 }
2621
2622 static void addReplyDouble(redisClient *c, double d) {
2623 char buf[128];
2624
2625 snprintf(buf,sizeof(buf),"%.17g",d);
2626 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2627 (unsigned long) strlen(buf),buf));
2628 }
2629
2630 static void addReplyLong(redisClient *c, long l) {
2631 char buf[128];
2632 size_t len;
2633
2634 if (l == 0) {
2635 addReply(c,shared.czero);
2636 return;
2637 } else if (l == 1) {
2638 addReply(c,shared.cone);
2639 return;
2640 }
2641 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2642 addReplySds(c,sdsnewlen(buf,len));
2643 }
2644
2645 static void addReplyLongLong(redisClient *c, long long ll) {
2646 char buf[128];
2647 size_t len;
2648
2649 if (ll == 0) {
2650 addReply(c,shared.czero);
2651 return;
2652 } else if (ll == 1) {
2653 addReply(c,shared.cone);
2654 return;
2655 }
2656 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2657 addReplySds(c,sdsnewlen(buf,len));
2658 }
2659
2660 static void addReplyUlong(redisClient *c, unsigned long ul) {
2661 char buf[128];
2662 size_t len;
2663
2664 if (ul == 0) {
2665 addReply(c,shared.czero);
2666 return;
2667 } else if (ul == 1) {
2668 addReply(c,shared.cone);
2669 return;
2670 }
2671 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2672 addReplySds(c,sdsnewlen(buf,len));
2673 }
2674
2675 static void addReplyBulkLen(redisClient *c, robj *obj) {
2676 size_t len;
2677
2678 if (obj->encoding == REDIS_ENCODING_RAW) {
2679 len = sdslen(obj->ptr);
2680 } else {
2681 long n = (long)obj->ptr;
2682
2683 /* Compute how many bytes will take this integer as a radix 10 string */
2684 len = 1;
2685 if (n < 0) {
2686 len++;
2687 n = -n;
2688 }
2689 while((n = n/10) != 0) {
2690 len++;
2691 }
2692 }
2693 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2694 }
2695
2696 static void addReplyBulk(redisClient *c, robj *obj) {
2697 addReplyBulkLen(c,obj);
2698 addReply(c,obj);
2699 addReply(c,shared.crlf);
2700 }
2701
2702 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2703 static void addReplyBulkCString(redisClient *c, char *s) {
2704 if (s == NULL) {
2705 addReply(c,shared.nullbulk);
2706 } else {
2707 robj *o = createStringObject(s,strlen(s));
2708 addReplyBulk(c,o);
2709 decrRefCount(o);
2710 }
2711 }
2712
2713 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2714 int cport, cfd;
2715 char cip[128];
2716 redisClient *c;
2717 REDIS_NOTUSED(el);
2718 REDIS_NOTUSED(mask);
2719 REDIS_NOTUSED(privdata);
2720
2721 cfd = anetAccept(server.neterr, fd, cip, &cport);
2722 if (cfd == AE_ERR) {
2723 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2724 return;
2725 }
2726 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2727 if ((c = createClient(cfd)) == NULL) {
2728 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2729 close(cfd); /* May be already closed, just ingore errors */
2730 return;
2731 }
2732 /* If maxclient directive is set and this is one client more... close the
2733 * connection. Note that we create the client instead to check before
2734 * for this condition, since now the socket is already set in nonblocking
2735 * mode and we can send an error for free using the Kernel I/O */
2736 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2737 char *err = "-ERR max number of clients reached\r\n";
2738
2739 /* That's a best effort error message, don't check write errors */
2740 if (write(c->fd,err,strlen(err)) == -1) {
2741 /* Nothing to do, Just to avoid the warning... */
2742 }
2743 freeClient(c);
2744 return;
2745 }
2746 server.stat_numconnections++;
2747 }
2748
2749 /* ======================= Redis objects implementation ===================== */
2750
2751 static robj *createObject(int type, void *ptr) {
2752 robj *o;
2753
2754 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2755 if (listLength(server.objfreelist)) {
2756 listNode *head = listFirst(server.objfreelist);
2757 o = listNodeValue(head);
2758 listDelNode(server.objfreelist,head);
2759 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2760 } else {
2761 if (server.vm_enabled) {
2762 pthread_mutex_unlock(&server.obj_freelist_mutex);
2763 o = zmalloc(sizeof(*o));
2764 } else {
2765 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2766 }
2767 }
2768 o->type = type;
2769 o->encoding = REDIS_ENCODING_RAW;
2770 o->ptr = ptr;
2771 o->refcount = 1;
2772 if (server.vm_enabled) {
2773 /* Note that this code may run in the context of an I/O thread
2774 * and accessing to server.unixtime in theory is an error
2775 * (no locks). But in practice this is safe, and even if we read
2776 * garbage Redis will not fail, as it's just a statistical info */
2777 o->vm.atime = server.unixtime;
2778 o->storage = REDIS_VM_MEMORY;
2779 }
2780 return o;
2781 }
2782
2783 static robj *createStringObject(char *ptr, size_t len) {
2784 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2785 }
2786
2787 static robj *createStringObjectFromLongLong(long long value) {
2788 robj *o;
2789 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2790 incrRefCount(shared.integers[value]);
2791 o = shared.integers[value];
2792 } else {
2793 o = createObject(REDIS_STRING, NULL);
2794 if (value >= LONG_MIN && value <= LONG_MAX) {
2795 o->encoding = REDIS_ENCODING_INT;
2796 o->ptr = (void*)((long)value);
2797 } else {
2798 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2799 }
2800 }
2801 return o;
2802 }
2803
2804 static robj *dupStringObject(robj *o) {
2805 assert(o->encoding == REDIS_ENCODING_RAW);
2806 return createStringObject(o->ptr,sdslen(o->ptr));
2807 }
2808
2809 static robj *createListObject(void) {
2810 list *l = listCreate();
2811
2812 listSetFreeMethod(l,decrRefCount);
2813 return createObject(REDIS_LIST,l);
2814 }
2815
2816 static robj *createSetObject(void) {
2817 dict *d = dictCreate(&setDictType,NULL);
2818 return createObject(REDIS_SET,d);
2819 }
2820
2821 static robj *createHashObject(void) {
2822 /* All the Hashes start as zipmaps. Will be automatically converted
2823 * into hash tables if there are enough elements or big elements
2824 * inside. */
2825 unsigned char *zm = zipmapNew();
2826 robj *o = createObject(REDIS_HASH,zm);
2827 o->encoding = REDIS_ENCODING_ZIPMAP;
2828 return o;
2829 }
2830
2831 static robj *createZsetObject(void) {
2832 zset *zs = zmalloc(sizeof(*zs));
2833
2834 zs->dict = dictCreate(&zsetDictType,NULL);
2835 zs->zsl = zslCreate();
2836 return createObject(REDIS_ZSET,zs);
2837 }
2838
2839 static void freeStringObject(robj *o) {
2840 if (o->encoding == REDIS_ENCODING_RAW) {
2841 sdsfree(o->ptr);
2842 }
2843 }
2844
2845 static void freeListObject(robj *o) {
2846 listRelease((list*) o->ptr);
2847 }
2848
2849 static void freeSetObject(robj *o) {
2850 dictRelease((dict*) o->ptr);
2851 }
2852
2853 static void freeZsetObject(robj *o) {
2854 zset *zs = o->ptr;
2855
2856 dictRelease(zs->dict);
2857 zslFree(zs->zsl);
2858 zfree(zs);
2859 }
2860
2861 static void freeHashObject(robj *o) {
2862 switch (o->encoding) {
2863 case REDIS_ENCODING_HT:
2864 dictRelease((dict*) o->ptr);
2865 break;
2866 case REDIS_ENCODING_ZIPMAP:
2867 zfree(o->ptr);
2868 break;
2869 default:
2870 redisPanic("Unknown hash encoding type");
2871 break;
2872 }
2873 }
2874
2875 static void incrRefCount(robj *o) {
2876 o->refcount++;
2877 }
2878
2879 static void decrRefCount(void *obj) {
2880 robj *o = obj;
2881
2882 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
2883 /* Object is a key of a swapped out value, or in the process of being
2884 * loaded. */
2885 if (server.vm_enabled &&
2886 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2887 {
2888 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2889 redisAssert(o->type == REDIS_STRING);
2890 freeStringObject(o);
2891 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2892 pthread_mutex_lock(&server.obj_freelist_mutex);
2893 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2894 !listAddNodeHead(server.objfreelist,o))
2895 zfree(o);
2896 pthread_mutex_unlock(&server.obj_freelist_mutex);
2897 server.vm_stats_swapped_objects--;
2898 return;
2899 }
2900 /* Object is in memory, or in the process of being swapped out. */
2901 if (--(o->refcount) == 0) {
2902 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2903 vmCancelThreadedIOJob(obj);
2904 switch(o->type) {
2905 case REDIS_STRING: freeStringObject(o); break;
2906 case REDIS_LIST: freeListObject(o); break;
2907 case REDIS_SET: freeSetObject(o); break;
2908 case REDIS_ZSET: freeZsetObject(o); break;
2909 case REDIS_HASH: freeHashObject(o); break;
2910 default: redisPanic("Unknown object type"); break;
2911 }
2912 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2913 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2914 !listAddNodeHead(server.objfreelist,o))
2915 zfree(o);
2916 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2917 }
2918 }
2919
2920 static robj *lookupKey(redisDb *db, robj *key) {
2921 dictEntry *de = dictFind(db->dict,key);
2922 if (de) {
2923 robj *key = dictGetEntryKey(de);
2924 robj *val = dictGetEntryVal(de);
2925
2926 if (server.vm_enabled) {
2927 if (key->storage == REDIS_VM_MEMORY ||
2928 key->storage == REDIS_VM_SWAPPING)
2929 {
2930 /* If we were swapping the object out, stop it, this key
2931 * was requested. */
2932 if (key->storage == REDIS_VM_SWAPPING)
2933 vmCancelThreadedIOJob(key);
2934 /* Update the access time of the key for the aging algorithm. */
2935 key->vm.atime = server.unixtime;
2936 } else {
2937 int notify = (key->storage == REDIS_VM_LOADING);
2938
2939 /* Our value was swapped on disk. Bring it at home. */
2940 redisAssert(val == NULL);
2941 val = vmLoadObject(key);
2942 dictGetEntryVal(de) = val;
2943
2944 /* Clients blocked by the VM subsystem may be waiting for
2945 * this key... */
2946 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2947 }
2948 }
2949 return val;
2950 } else {
2951 return NULL;
2952 }
2953 }
2954
2955 static robj *lookupKeyRead(redisDb *db, robj *key) {
2956 expireIfNeeded(db,key);
2957 return lookupKey(db,key);
2958 }
2959
2960 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2961 deleteIfVolatile(db,key);
2962 return lookupKey(db,key);
2963 }
2964
2965 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2966 robj *o = lookupKeyRead(c->db, key);
2967 if (!o) addReply(c,reply);
2968 return o;
2969 }
2970
2971 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2972 robj *o = lookupKeyWrite(c->db, key);
2973 if (!o) addReply(c,reply);
2974 return o;
2975 }
2976
2977 static int checkType(redisClient *c, robj *o, int type) {
2978 if (o->type != type) {
2979 addReply(c,shared.wrongtypeerr);
2980 return 1;
2981 }
2982 return 0;
2983 }
2984
2985 static int deleteKey(redisDb *db, robj *key) {
2986 int retval;
2987
2988 /* We need to protect key from destruction: after the first dictDelete()
2989 * it may happen that 'key' is no longer valid if we don't increment
2990 * it's count. This may happen when we get the object reference directly
2991 * from the hash table with dictRandomKey() or dict iterators */
2992 incrRefCount(key);
2993 if (dictSize(db->expires)) dictDelete(db->expires,key);
2994 retval = dictDelete(db->dict,key);
2995 decrRefCount(key);
2996
2997 return retval == DICT_OK;
2998 }
2999
3000 /* Check if the nul-terminated string 's' can be represented by a long
3001 * (that is, is a number that fits into long without any other space or
3002 * character before or after the digits).
3003 *
3004 * If so, the function returns REDIS_OK and *longval is set to the value
3005 * of the number. Otherwise REDIS_ERR is returned */
3006 static int isStringRepresentableAsLong(sds s, long *longval) {
3007 char buf[32], *endptr;
3008 long value;
3009 int slen;
3010
3011 value = strtol(s, &endptr, 10);
3012 if (endptr[0] != '\0') return REDIS_ERR;
3013 slen = snprintf(buf,32,"%ld",value);
3014
3015 /* If the number converted back into a string is not identical
3016 * then it's not possible to encode the string as integer */
3017 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
3018 if (longval) *longval = value;
3019 return REDIS_OK;
3020 }
3021
3022 /* Try to encode a string object in order to save space */
3023 static robj *tryObjectEncoding(robj *o) {
3024 long value;
3025 sds s = o->ptr;
3026
3027 if (o->encoding != REDIS_ENCODING_RAW)
3028 return o; /* Already encoded */
3029
3030 /* It's not safe to encode shared objects: shared objects can be shared
3031 * everywhere in the "object space" of Redis. Encoded objects can only
3032 * appear as "values" (and not, for instance, as keys) */
3033 if (o->refcount > 1) return o;
3034
3035 /* Currently we try to encode only strings */
3036 redisAssert(o->type == REDIS_STRING);
3037
3038 /* Check if we can represent this string as a long integer */
3039 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
3040
3041 /* Ok, this object can be encoded */
3042 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3043 decrRefCount(o);
3044 incrRefCount(shared.integers[value]);
3045 return shared.integers[value];
3046 } else {
3047 o->encoding = REDIS_ENCODING_INT;
3048 sdsfree(o->ptr);
3049 o->ptr = (void*) value;
3050 return o;
3051 }
3052 }
3053
3054 /* Get a decoded version of an encoded object (returned as a new object).
3055 * If the object is already raw-encoded just increment the ref count. */
3056 static robj *getDecodedObject(robj *o) {
3057 robj *dec;
3058
3059 if (o->encoding == REDIS_ENCODING_RAW) {
3060 incrRefCount(o);
3061 return o;
3062 }
3063 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3064 char buf[32];
3065
3066 snprintf(buf,32,"%ld",(long)o->ptr);
3067 dec = createStringObject(buf,strlen(buf));
3068 return dec;
3069 } else {
3070 redisPanic("Unknown encoding type");
3071 }
3072 }
3073
3074 /* Compare two string objects via strcmp() or alike.
3075 * Note that the objects may be integer-encoded. In such a case we
3076 * use snprintf() to get a string representation of the numbers on the stack
3077 * and compare the strings, it's much faster than calling getDecodedObject().
3078 *
3079 * Important note: if objects are not integer encoded, but binary-safe strings,
3080 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3081 * binary safe. */
3082 static int compareStringObjects(robj *a, robj *b) {
3083 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3084 char bufa[128], bufb[128], *astr, *bstr;
3085 int bothsds = 1;
3086
3087 if (a == b) return 0;
3088 if (a->encoding != REDIS_ENCODING_RAW) {
3089 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3090 astr = bufa;
3091 bothsds = 0;
3092 } else {
3093 astr = a->ptr;
3094 }
3095 if (b->encoding != REDIS_ENCODING_RAW) {
3096 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3097 bstr = bufb;
3098 bothsds = 0;
3099 } else {
3100 bstr = b->ptr;
3101 }
3102 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3103 }
3104
3105 static size_t stringObjectLen(robj *o) {
3106 redisAssert(o->type == REDIS_STRING);
3107 if (o->encoding == REDIS_ENCODING_RAW) {
3108 return sdslen(o->ptr);
3109 } else {
3110 char buf[32];
3111
3112 return snprintf(buf,32,"%ld",(long)o->ptr);
3113 }
3114 }
3115
3116 static int getDoubleFromObject(robj *o, double *target) {
3117 double value;
3118 char *eptr;
3119
3120 if (o == NULL) {
3121 value = 0;
3122 } else {
3123 redisAssert(o->type == REDIS_STRING);
3124 if (o->encoding == REDIS_ENCODING_RAW) {
3125 value = strtod(o->ptr, &eptr);
3126 if (eptr[0] != '\0') return REDIS_ERR;
3127 } else if (o->encoding == REDIS_ENCODING_INT) {
3128 value = (long)o->ptr;
3129 } else {
3130 redisAssert(1 != 1);
3131 }
3132 }
3133
3134 *target = value;
3135 return REDIS_OK;
3136 }
3137
3138 static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3139 double value;
3140 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3141 if (msg != NULL) {
3142 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3143 } else {
3144 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3145 }
3146 return REDIS_ERR;
3147 }
3148
3149 *target = value;
3150 return REDIS_OK;
3151 }
3152
3153 static int getLongLongFromObject(robj *o, long long *target) {
3154 long long value;
3155 char *eptr;
3156
3157 if (o == NULL) {
3158 value = 0;
3159 } else {
3160 redisAssert(o->type == REDIS_STRING);
3161 if (o->encoding == REDIS_ENCODING_RAW) {
3162 value = strtoll(o->ptr, &eptr, 10);
3163 if (eptr[0] != '\0') return REDIS_ERR;
3164 } else if (o->encoding == REDIS_ENCODING_INT) {
3165 value = (long)o->ptr;
3166 } else {
3167 redisAssert(1 != 1);
3168 }
3169 }
3170
3171 *target = value;
3172 return REDIS_OK;
3173 }
3174
3175 static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3176 long long value;
3177 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3178 if (msg != NULL) {
3179 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3180 } else {
3181 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3182 }
3183 return REDIS_ERR;
3184 }
3185
3186 *target = value;
3187 return REDIS_OK;
3188 }
3189
3190 static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3191 long long value;
3192
3193 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3194 if (value < LONG_MIN || value > LONG_MAX) {
3195 if (msg != NULL) {
3196 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3197 } else {
3198 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3199 }
3200 return REDIS_ERR;
3201 }
3202
3203 *target = value;
3204 return REDIS_OK;
3205 }
3206
3207 /*============================ RDB saving/loading =========================== */
3208
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3213
/* Write the time 't' to 'fp' as a 4 byte signed integer in host byte
 * order (the value is converted to int32_t first).
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3219
3220 /* check rdbLoadLen() comments for more info */
3221 static int rdbSaveLen(FILE *fp, uint32_t len) {
3222 unsigned char buf[2];
3223
3224 if (len < (1<<6)) {
3225 /* Save a 6 bit len */
3226 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3227 if (fwrite(buf,1,1,fp) == 0) return -1;
3228 } else if (len < (1<<14)) {
3229 /* Save a 14 bit len */
3230 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3231 buf[1] = len&0xFF;
3232 if (fwrite(buf,2,1,fp) == 0) return -1;
3233 } else {
3234 /* Save a 32 bit len */
3235 buf[0] = (REDIS_RDB_32BITLEN<<6);
3236 if (fwrite(buf,1,1,fp) == 0) return -1;
3237 len = htonl(len);
3238 if (fwrite(&len,4,1,fp) == 0) return -1;
3239 }
3240 return 0;
3241 }
3242
3243 /* String objects in the form "2391" "-100" without any space and with a
3244 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3245 * encoded as integers to save space */
3246 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3247 long long value;
3248 char *endptr, buf[32];
3249
3250 /* Check if it's possible to encode this value as a number */
3251 value = strtoll(s, &endptr, 10);
3252 if (endptr[0] != '\0') return 0;
3253 snprintf(buf,32,"%lld",value);
3254
3255 /* If the number converted back into a string is not identical
3256 * then it's not possible to encode the string as integer */
3257 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3258
3259 /* Finally check if it fits in our ranges */
3260 if (value >= -(1<<7) && value <= (1<<7)-1) {
3261 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3262 enc[1] = value&0xFF;
3263 return 2;
3264 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3265 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3266 enc[1] = value&0xFF;
3267 enc[2] = (value>>8)&0xFF;
3268 return 3;
3269 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3270 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3271 enc[1] = value&0xFF;
3272 enc[2] = (value>>8)&0xFF;
3273 enc[3] = (value>>16)&0xFF;
3274 enc[4] = (value>>24)&0xFF;
3275 return 5;
3276 } else {
3277 return 0;
3278 }
3279 }
3280
3281 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3282 size_t comprlen, outlen;
3283 unsigned char byte;
3284 void *out;
3285
3286 /* We require at least four bytes compression for this to be worth it */
3287 if (len <= 4) return 0;
3288 outlen = len-4;
3289 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3290 comprlen = lzf_compress(s, len, out, outlen);
3291 if (comprlen == 0) {
3292 zfree(out);
3293 return 0;
3294 }
3295 /* Data compressed! Let's save it on disk */
3296 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3297 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3298 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3299 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3300 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3301 zfree(out);
3302 return comprlen;
3303
3304 writeerr:
3305 zfree(out);
3306 return -1;
3307 }
3308
3309 /* Save a string objet as [len][data] on disk. If the object is a string
3310 * representation of an integer value we try to safe it in a special form */
3311 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3312 int enclen;
3313
3314 /* Try integer encoding */
3315 if (len <= 11) {
3316 unsigned char buf[5];
3317 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3318 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3319 return 0;
3320 }
3321 }
3322
3323 /* Try LZF compression - under 20 bytes it's unable to compress even
3324 * aaaaaaaaaaaaaaaaaa so skip it */
3325 if (server.rdbcompression && len > 20) {
3326 int retval;
3327
3328 retval = rdbSaveLzfStringObject(fp,s,len);
3329 if (retval == -1) return -1;
3330 if (retval > 0) return 0;
3331 /* retval == 0 means data can't be compressed, save the old way */
3332 }
3333
3334 /* Store verbatim */
3335 if (rdbSaveLen(fp,len) == -1) return -1;
3336 if (len && fwrite(s,len,1,fp) == 0) return -1;
3337 return 0;
3338 }
3339
3340 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3341 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3342 int retval;
3343
3344 /* Avoid incr/decr ref count business when possible.
3345 * This plays well with copy-on-write given that we are probably
3346 * in a child process (BGSAVE). Also this makes sure key objects
3347 * of swapped objects are not incRefCount-ed (an assert does not allow
3348 * this in order to avoid bugs) */
3349 if (obj->encoding != REDIS_ENCODING_RAW) {
3350 obj = getDecodedObject(obj);
3351 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3352 decrRefCount(obj);
3353 } else {
3354 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3355 }
3356 return retval;
3357 }
3358
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* special values are a single marker byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (isfinite(val)) {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    } else {
        buf[0] = (val < 0) ? 255 : 254;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3385
3386 /* Save a Redis object. */
3387 static int rdbSaveObject(FILE *fp, robj *o) {
3388 if (o->type == REDIS_STRING) {
3389 /* Save a string value */
3390 if (rdbSaveStringObject(fp,o) == -1) return -1;
3391 } else if (o->type == REDIS_LIST) {
3392 /* Save a list value */
3393 list *list = o->ptr;
3394 listIter li;
3395 listNode *ln;
3396
3397 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3398 listRewind(list,&li);
3399 while((ln = listNext(&li))) {
3400 robj *eleobj = listNodeValue(ln);
3401
3402 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3403 }
3404 } else if (o->type == REDIS_SET) {
3405 /* Save a set value */
3406 dict *set = o->ptr;
3407 dictIterator *di = dictGetIterator(set);
3408 dictEntry *de;
3409
3410 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3411 while((de = dictNext(di)) != NULL) {
3412 robj *eleobj = dictGetEntryKey(de);
3413
3414 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3415 }
3416 dictReleaseIterator(di);
3417 } else if (o->type == REDIS_ZSET) {
3418 /* Save a set value */
3419 zset *zs = o->ptr;
3420 dictIterator *di = dictGetIterator(zs->dict);
3421 dictEntry *de;
3422
3423 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3424 while((de = dictNext(di)) != NULL) {
3425 robj *eleobj = dictGetEntryKey(de);
3426 double *score = dictGetEntryVal(de);
3427
3428 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3429 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3430 }
3431 dictReleaseIterator(di);
3432 } else if (o->type == REDIS_HASH) {
3433 /* Save a hash value */
3434 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3435 unsigned char *p = zipmapRewind(o->ptr);
3436 unsigned int count = zipmapLen(o->ptr);
3437 unsigned char *key, *val;
3438 unsigned int klen, vlen;
3439
3440 if (rdbSaveLen(fp,count) == -1) return -1;
3441 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3442 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3443 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3444 }
3445 } else {
3446 dictIterator *di = dictGetIterator(o->ptr);
3447 dictEntry *de;
3448
3449 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3450 while((de = dictNext(di)) != NULL) {
3451 robj *key = dictGetEntryKey(de);
3452 robj *val = dictGetEntryVal(de);
3453
3454 if (rdbSaveStringObject(fp,key) == -1) return -1;
3455 if (rdbSaveStringObject(fp,val) == -1) return -1;
3456 }
3457 dictReleaseIterator(di);
3458 }
3459 } else {
3460 redisPanic("Unknown object type");
3461 }
3462 return 0;
3463 }
3464
3465 /* Return the length the object will have on disk if saved with
3466 * the rdbSaveObject() function. Currently we use a trick to get
3467 * this length with very little changes to the code. In the future
3468 * we could switch to a faster solution. */
3469 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3470 if (fp == NULL) fp = server.devnull;
3471 rewind(fp);
3472 assert(rdbSaveObject(fp,o) != 1);
3473 return ftello(fp);
3474 }
3475
3476 /* Return the number of pages required to save this object in the swap file */
3477 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3478 off_t bytes = rdbSavedObjectLen(o,fp);
3479
3480 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3481 }
3482
3483 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3484 static int rdbSave(char *filename) {
3485 dictIterator *di = NULL;
3486 dictEntry *de;
3487 FILE *fp;
3488 char tmpfile[256];
3489 int j;
3490 time_t now = time(NULL);
3491
3492 /* Wait for I/O therads to terminate, just in case this is a
3493 * foreground-saving, to avoid seeking the swap file descriptor at the
3494 * same time. */
3495 if (server.vm_enabled)
3496 waitEmptyIOJobsQueue();
3497
3498 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3499 fp = fopen(tmpfile,"w");
3500 if (!fp) {
3501 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3502 return REDIS_ERR;
3503 }
3504 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3505 for (j = 0; j < server.dbnum; j++) {
3506 redisDb *db = server.db+j;
3507 dict *d = db->dict;
3508 if (dictSize(d) == 0) continue;
3509 di = dictGetIterator(d);
3510 if (!di) {
3511 fclose(fp);
3512 return REDIS_ERR;
3513 }
3514
3515 /* Write the SELECT DB opcode */
3516 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3517 if (rdbSaveLen(fp,j) == -1) goto werr;
3518
3519 /* Iterate this DB writing every entry */
3520 while((de = dictNext(di)) != NULL) {
3521 robj *key = dictGetEntryKey(de);
3522 robj *o = dictGetEntryVal(de);
3523 time_t expiretime = getExpire(db,key);
3524
3525 /* Save the expire time */
3526 if (expiretime != -1) {
3527 /* If this key is already expired skip it */
3528 if (expiretime < now) continue;
3529 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3530 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3531 }
3532 /* Save the key and associated value. This requires special
3533 * handling if the value is swapped out. */
3534 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3535 key->storage == REDIS_VM_SWAPPING) {
3536 /* Save type, key, value */
3537 if (rdbSaveType(fp,o->type) == -1) goto werr;
3538 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3539 if (rdbSaveObject(fp,o) == -1) goto werr;
3540 } else {
3541 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3542 robj *po;
3543 /* Get a preview of the object in memory */
3544 po = vmPreviewObject(key);
3545 /* Save type, key, value */
3546 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3547 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3548 if (rdbSaveObject(fp,po) == -1) goto werr;
3549 /* Remove the loaded object from memory */
3550 decrRefCount(po);
3551 }
3552 }
3553 dictReleaseIterator(di);
3554 }
3555 /* EOF opcode */
3556 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3557
3558 /* Make sure data will not remain on the OS's output buffers */
3559 fflush(fp);
3560 fsync(fileno(fp));
3561 fclose(fp);
3562
3563 /* Use RENAME to make sure the DB file is changed atomically only
3564 * if the generate DB file is ok. */
3565 if (rename(tmpfile,filename) == -1) {
3566 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3567 unlink(tmpfile);
3568 return REDIS_ERR;
3569 }
3570 redisLog(REDIS_NOTICE,"DB saved on disk");
3571 server.dirty = 0;
3572 server.lastsave = time(NULL);
3573 return REDIS_OK;
3574
3575 werr:
3576 fclose(fp);
3577 unlink(tmpfile);
3578 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3579 if (di) dictReleaseIterator(di);
3580 return REDIS_ERR;
3581 }
3582
3583 static int rdbSaveBackground(char *filename) {
3584 pid_t childpid;
3585
3586 if (server.bgsavechildpid != -1) return REDIS_ERR;
3587 if (server.vm_enabled) waitEmptyIOJobsQueue();
3588 if ((childpid = fork()) == 0) {
3589 /* Child */
3590 if (server.vm_enabled) vmReopenSwapFile();
3591 close(server.fd);
3592 if (rdbSave(filename) == REDIS_OK) {
3593 _exit(0);
3594 } else {
3595 _exit(1);
3596 }
3597 } else {
3598 /* Parent */
3599 if (childpid == -1) {
3600 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3601 strerror(errno));
3602 return REDIS_ERR;
3603 }
3604 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3605 server.bgsavechildpid = childpid;
3606 updateDictResizePolicy();
3607 return REDIS_OK;
3608 }
3609 return REDIS_OK; /* unreached */
3610 }
3611
/* Unlink the temporary dump file named after 'childpid', that is
 * "temp-<pid>.rdb" relative to the current directory. The result of
 * unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3618
/* Read a single byte from 'fp' and return it as a non-negative value
 * (0-255), or -1 when the byte cannot be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,sizeof(byte),1,fp) != 1) return -1;
    return (int)byte;
}
3624
/* Read a 4 byte signed integer from 'fp', in the byte order of the running
 * machine, and return it as a time_t. Returns -1 on a short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,sizeof(stamp),1,fp) != 1) return -1;
    return (time_t)stamp;
}
3630
3631 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3632 * of this file for a description of how this are stored on disk.
3633 *
3634 * isencoded is set to 1 if the readed length is not actually a length but
3635 * an "encoding type", check the above comments for more info */
3636 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3637 unsigned char buf[2];
3638 uint32_t len;
3639 int type;
3640
3641 if (isencoded) *isencoded = 0;
3642 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3643 type = (buf[0]&0xC0)>>6;
3644 if (type == REDIS_RDB_6BITLEN) {
3645 /* Read a 6 bit len */
3646 return buf[0]&0x3F;
3647 } else if (type == REDIS_RDB_ENCVAL) {
3648 /* Read a 6 bit len encoding type */
3649 if (isencoded) *isencoded = 1;
3650 return buf[0]&0x3F;
3651 } else if (type == REDIS_RDB_14BITLEN) {
3652 /* Read a 14 bit len */
3653 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3654 return ((buf[0]&0x3F)<<8)|buf[1];
3655 } else {
3656 /* Read a 32 bit len */
3657 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3658 return ntohl(len);
3659 }
3660 }
3661
3662 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3663 unsigned char enc[4];
3664 long long val;
3665
3666 if (enctype == REDIS_RDB_ENC_INT8) {
3667 if (fread(enc,1,1,fp) == 0) return NULL;
3668 val = (signed char)enc[0];
3669 } else if (enctype == REDIS_RDB_ENC_INT16) {
3670 uint16_t v;
3671 if (fread(enc,2,1,fp) == 0) return NULL;
3672 v = enc[0]|(enc[1]<<8);
3673 val = (int16_t)v;
3674 } else if (enctype == REDIS_RDB_ENC_INT32) {
3675 uint32_t v;
3676 if (fread(enc,4,1,fp) == 0) return NULL;
3677 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3678 val = (int32_t)v;
3679 } else {
3680 val = 0; /* anti-warning */
3681 redisPanic("Unknown RDB integer encoding type");
3682 }
3683 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3684 }
3685
3686 static robj *rdbLoadLzfStringObject(FILE*fp) {
3687 unsigned int len, clen;
3688 unsigned char *c = NULL;
3689 sds val = NULL;
3690
3691 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3692 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3693 if ((c = zmalloc(clen)) == NULL) goto err;
3694 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3695 if (fread(c,clen,1,fp) == 0) goto err;
3696 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3697 zfree(c);
3698 return createObject(REDIS_STRING,val);
3699 err:
3700 zfree(c);
3701 sdsfree(val);
3702 return NULL;
3703 }
3704
3705 static robj *rdbLoadStringObject(FILE*fp) {
3706 int isencoded;
3707 uint32_t len;
3708 sds val;
3709
3710 len = rdbLoadLen(fp,&isencoded);
3711 if (isencoded) {
3712 switch(len) {
3713 case REDIS_RDB_ENC_INT8:
3714 case REDIS_RDB_ENC_INT16:
3715 case REDIS_RDB_ENC_INT32:
3716 return rdbLoadIntegerObject(fp,len);
3717 case REDIS_RDB_ENC_LZF:
3718 return rdbLoadLzfStringObject(fp);
3719 default:
3720 redisPanic("Unknown RDB encoding type");
3721 }
3722 }
3723
3724 if (len == REDIS_RDB_LENERR) return NULL;
3725 val = sdsnewlen(NULL,len);
3726 if (len && fread(val,len,1,fp) == 0) {
3727 sdsfree(val);
3728 return NULL;
3729 }
3730 return createObject(REDIS_STRING,val);
3731 }
3732
3733 /* For information about double serialization check rdbSaveDoubleValue() */
3734 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3735 char buf[128];
3736 unsigned char len;
3737
3738 if (fread(&len,1,1,fp) == 0) return -1;
3739 switch(len) {
3740 case 255: *val = R_NegInf; return 0;
3741 case 254: *val = R_PosInf; return 0;
3742 case 253: *val = R_Nan; return 0;
3743 default:
3744 if (fread(buf,len,1,fp) == 0) return -1;
3745 buf[len] = '\0';
3746 sscanf(buf, "%lg", val);
3747 return 0;
3748 }
3749 }
3750
3751 /* Load a Redis object of the specified type from the specified file.
3752 * On success a newly allocated object is returned, otherwise NULL. */
3753 static robj *rdbLoadObject(int type, FILE *fp) {
3754 robj *o;
3755
3756 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3757 if (type == REDIS_STRING) {
3758 /* Read string value */
3759 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3760 o = tryObjectEncoding(o);
3761 } else if (type == REDIS_LIST || type == REDIS_SET) {
3762 /* Read list/set value */
3763 uint32_t listlen;
3764
3765 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3766 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3767 /* It's faster to expand the dict to the right size asap in order
3768 * to avoid rehashing */
3769 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3770 dictExpand(o->ptr,listlen);
3771 /* Load every single element of the list/set */
3772 while(listlen--) {
3773 robj *ele;
3774
3775 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3776 ele = tryObjectEncoding(ele);
3777 if (type == REDIS_LIST) {
3778 listAddNodeTail((list*)o->ptr,ele);
3779 } else {
3780 dictAdd((dict*)o->ptr,ele,NULL);
3781 }
3782 }
3783 } else if (type == REDIS_ZSET) {
3784 /* Read list/set value */
3785 size_t zsetlen;
3786 zset *zs;
3787
3788 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3789 o = createZsetObject();
3790 zs = o->ptr;
3791 /* Load every single element of the list/set */
3792 while(zsetlen--) {
3793 robj *ele;
3794 double *score = zmalloc(sizeof(double));
3795
3796 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3797 ele = tryObjectEncoding(ele);
3798 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3799 dictAdd(zs->dict,ele,score);
3800 zslInsert(zs->zsl,*score,ele);
3801 incrRefCount(ele); /* added to skiplist */
3802 }
3803 } else if (type == REDIS_HASH) {
3804 size_t hashlen;
3805
3806 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3807 o = createHashObject();
3808 /* Too many entries? Use an hash table. */
3809 if (hashlen > server.hash_max_zipmap_entries)
3810 convertToRealHash(o);
3811 /* Load every key/value, then set it into the zipmap or hash
3812 * table, as needed. */
3813 while(hashlen--) {
3814 robj *key, *val;
3815
3816 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3817 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3818 /* If we are using a zipmap and there are too big values
3819 * the object is converted to real hash table encoding. */
3820 if (o->encoding != REDIS_ENCODING_HT &&
3821 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3822 sdslen(val->ptr) > server.hash_max_zipmap_value))
3823 {
3824 convertToRealHash(o);
3825 }
3826
3827 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3828 unsigned char *zm = o->ptr;
3829
3830 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3831 val->ptr,sdslen(val->ptr),NULL);
3832 o->ptr = zm;
3833 decrRefCount(key);
3834 decrRefCount(val);
3835 } else {
3836 key = tryObjectEncoding(key);
3837 val = tryObjectEncoding(val);
3838 dictAdd((dict*)o->ptr,key,val);
3839 }
3840 }
3841 } else {
3842 redisPanic("Unknown object type");
3843 }
3844 return o;
3845 }
3846
3847 static int rdbLoad(char *filename) {
3848 FILE *fp;
3849 robj *keyobj = NULL;
3850 uint32_t dbid;
3851 int type, retval, rdbver;
3852 dict *d = server.db[0].dict;
3853 redisDb *db = server.db+0;
3854 char buf[1024];
3855 time_t expiretime = -1, now = time(NULL);
3856 long long loadedkeys = 0;
3857
3858 fp = fopen(filename,"r");
3859 if (!fp) return REDIS_ERR;
3860 if (fread(buf,9,1,fp) == 0) goto eoferr;
3861 buf[9] = '\0';
3862 if (memcmp(buf,"REDIS",5) != 0) {
3863 fclose(fp);
3864 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3865 return REDIS_ERR;
3866 }
3867 rdbver = atoi(buf+5);
3868 if (rdbver != 1) {
3869 fclose(fp);
3870 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3871 return REDIS_ERR;
3872 }
3873 while(1) {
3874 robj *o;
3875
3876 /* Read type. */
3877 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3878 if (type == REDIS_EXPIRETIME) {
3879 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3880 /* We read the time so we need to read the object type again */
3881 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3882 }
3883 if (type == REDIS_EOF) break;
3884 /* Handle SELECT DB opcode as a special case */
3885 if (type == REDIS_SELECTDB) {
3886 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3887 goto eoferr;
3888 if (dbid >= (unsigned)server.dbnum) {
3889 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3890 exit(1);
3891 }
3892 db = server.db+dbid;
3893 d = db->dict;
3894 continue;
3895 }
3896 /* Read key */
3897 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3898 /* Read value */
3899 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3900 /* Add the new object in the hash table */
3901 retval = dictAdd(d,keyobj,o);
3902 if (retval == DICT_ERR) {
3903 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3904 exit(1);
3905 }
3906 /* Set the expire time if needed */
3907 if (expiretime != -1) {
3908 setExpire(db,keyobj,expiretime);
3909 /* Delete this key if already expired */
3910 if (expiretime < now) deleteKey(db,keyobj);
3911 expiretime = -1;
3912 }
3913 keyobj = o = NULL;
3914 /* Handle swapping while loading big datasets when VM is on */
3915 loadedkeys++;
3916 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3917 while (zmalloc_used_memory() > server.vm_max_memory) {
3918 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3919 }
3920 }
3921 }
3922 fclose(fp);
3923 return REDIS_OK;
3924
3925 eoferr: /* unexpected end of file is handled here with a fatal exit */
3926 if (keyobj) decrRefCount(keyobj);
3927 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3928 exit(1);
3929 return REDIS_ERR; /* Just to avoid warning */
3930 }
3931
3932 /*================================== Commands =============================== */
3933
3934 static void authCommand(redisClient *c) {
3935 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3936 c->authenticated = 1;
3937 addReply(c,shared.ok);
3938 } else {
3939 c->authenticated = 0;
3940 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3941 }
3942 }
3943
3944 static void pingCommand(redisClient *c) {
3945 addReply(c,shared.pong);
3946 }
3947
3948 static void echoCommand(redisClient *c) {
3949 addReplyBulk(c,c->argv[1]);
3950 }
3951
3952 /*=================================== Strings =============================== */
3953
3954 static void setGenericCommand(redisClient *c, int nx) {
3955 int retval;
3956
3957 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3958 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3959 if (retval == DICT_ERR) {
3960 if (!nx) {
3961 /* If the key is about a swapped value, we want a new key object
3962 * to overwrite the old. So we delete the old key in the database.
3963 * This will also make sure that swap pages about the old object
3964 * will be marked as free. */
3965 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3966 incrRefCount(c->argv[1]);
3967 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3968 incrRefCount(c->argv[2]);
3969 } else {
3970 addReply(c,shared.czero);
3971 return;
3972 }
3973 } else {
3974 incrRefCount(c->argv[1]);
3975 incrRefCount(c->argv[2]);
3976 }
3977 server.dirty++;
3978 removeExpire(c->db,c->argv[1]);
3979 addReply(c, nx ? shared.cone : shared.ok);
3980 }
3981
3982 static void setCommand(redisClient *c) {
3983 setGenericCommand(c,0);
3984 }
3985
3986 static void setnxCommand(redisClient *c) {
3987 setGenericCommand(c,1);
3988 }
3989
3990 static int getGenericCommand(redisClient *c) {
3991 robj *o;
3992
3993 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3994 return REDIS_OK;
3995
3996 if (o->type != REDIS_STRING) {
3997 addReply(c,shared.wrongtypeerr);
3998 return REDIS_ERR;
3999 } else {
4000 addReplyBulk(c,o);
4001 return REDIS_OK;
4002 }
4003 }
4004
4005 static void getCommand(redisClient *c) {
4006 getGenericCommand(c);
4007 }
4008
4009 static void getsetCommand(redisClient *c) {
4010 if (getGenericCommand(c) == REDIS_ERR) return;
4011 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4012 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4013 } else {
4014 incrRefCount(c->argv[1]);
4015 }
4016 incrRefCount(c->argv[2]);
4017 server.dirty++;
4018 removeExpire(c->db,c->argv[1]);
4019 }
4020
4021 static void mgetCommand(redisClient *c) {
4022 int j;
4023
4024 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
4025 for (j = 1; j < c->argc; j++) {
4026 robj *o = lookupKeyRead(c->db,c->argv[j]);
4027 if (o == NULL) {
4028 addReply(c,shared.nullbulk);
4029 } else {
4030 if (o->type != REDIS_STRING) {
4031 addReply(c,shared.nullbulk);
4032 } else {
4033 addReplyBulk(c,o);
4034 }
4035 }
4036 }
4037 }
4038
4039 static void msetGenericCommand(redisClient *c, int nx) {
4040 int j, busykeys = 0;
4041
4042 if ((c->argc % 2) == 0) {
4043 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
4044 return;
4045 }
4046 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4047 * set nothing at all if at least one already key exists. */
4048 if (nx) {
4049 for (j = 1; j < c->argc; j += 2) {
4050 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4051 busykeys++;
4052 }
4053 }
4054 }
4055 if (busykeys) {
4056 addReply(c, shared.czero);
4057 return;
4058 }
4059
4060 for (j = 1; j < c->argc; j += 2) {
4061 int retval;
4062
4063 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
4064 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4065 if (retval == DICT_ERR) {
4066 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4067 incrRefCount(c->argv[j+1]);
4068 } else {
4069 incrRefCount(c->argv[j]);
4070 incrRefCount(c->argv[j+1]);
4071 }
4072 removeExpire(c->db,c->argv[j]);
4073 }
4074 server.dirty += (c->argc-1)/2;
4075 addReply(c, nx ? shared.cone : shared.ok);
4076 }
4077
4078 static void msetCommand(redisClient *c) {
4079 msetGenericCommand(c,0);
4080 }
4081
4082 static void msetnxCommand(redisClient *c) {
4083 msetGenericCommand(c,1);
4084 }
4085
4086 static void incrDecrCommand(redisClient *c, long long incr) {
4087 long long value;
4088 int retval;
4089 robj *o;
4090
4091 o = lookupKeyWrite(c->db,c->argv[1]);
4092
4093 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
4094
4095 value += incr;
4096 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
4097 o = tryObjectEncoding(o);
4098 retval = dictAdd(c->db->dict,c->argv[1],o);
4099 if (retval == DICT_ERR) {
4100 dictReplace(c->db->dict,c->argv[1],o);
4101 removeExpire(c->db,c->argv[1]);
4102 } else {
4103 incrRefCount(c->argv[1]);
4104 }
4105 server.dirty++;
4106 addReply(c,shared.colon);
4107 addReply(c,o);
4108 addReply(c,shared.crlf);
4109 }
4110
4111 static void incrCommand(redisClient *c) {
4112 incrDecrCommand(c,1);
4113 }
4114
4115 static void decrCommand(redisClient *c) {
4116 incrDecrCommand(c,-1);
4117 }
4118
4119 static void incrbyCommand(redisClient *c) {
4120 long long incr;
4121
4122 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4123 incrDecrCommand(c,incr);
4124 }
4125
4126 static void decrbyCommand(redisClient *c) {
4127 long long incr;
4128
4129 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
4130 incrDecrCommand(c,-incr);
4131 }
4132
4133 static void appendCommand(redisClient *c) {
4134 int retval;
4135 size_t totlen;
4136 robj *o;
4137
4138 o = lookupKeyWrite(c->db,c->argv[1]);
4139 if (o == NULL) {
4140 /* Create the key */
4141 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4142 incrRefCount(c->argv[1]);
4143 incrRefCount(c->argv[2]);
4144 totlen = stringObjectLen(c->argv[2]);
4145 } else {
4146 dictEntry *de;
4147
4148 de = dictFind(c->db->dict,c->argv[1]);
4149 assert(de != NULL);
4150
4151 o = dictGetEntryVal(de);
4152 if (o->type != REDIS_STRING) {
4153 addReply(c,shared.wrongtypeerr);
4154 return;
4155 }
4156 /* If the object is specially encoded or shared we have to make
4157 * a copy */
4158 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4159 robj *decoded = getDecodedObject(o);
4160
4161 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4162 decrRefCount(decoded);
4163 dictReplace(c->db->dict,c->argv[1],o);
4164 }
4165 /* APPEND! */
4166 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4167 o->ptr = sdscatlen(o->ptr,
4168 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4169 } else {
4170 o->ptr = sdscatprintf(o->ptr, "%ld",
4171 (unsigned long) c->argv[2]->ptr);
4172 }
4173 totlen = sdslen(o->ptr);
4174 }
4175 server.dirty++;
4176 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4177 }
4178
4179 static void substrCommand(redisClient *c) {
4180 robj *o;
4181 long start = atoi(c->argv[2]->ptr);
4182 long end = atoi(c->argv[3]->ptr);
4183 size_t rangelen, strlen;
4184 sds range;
4185
4186 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4187 checkType(c,o,REDIS_STRING)) return;
4188
4189 o = getDecodedObject(o);
4190 strlen = sdslen(o->ptr);
4191
4192 /* convert negative indexes */
4193 if (start < 0) start = strlen+start;
4194 if (end < 0) end = strlen+end;
4195 if (start < 0) start = 0;
4196 if (end < 0) end = 0;
4197
4198 /* indexes sanity checks */
4199 if (start > end || (size_t)start >= strlen) {
4200 /* Out of range start or start > end result in null reply */
4201 addReply(c,shared.nullbulk);
4202 decrRefCount(o);
4203 return;
4204 }
4205 if ((size_t)end >= strlen) end = strlen-1;
4206 rangelen = (end-start)+1;
4207
4208 /* Return the result */
4209 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4210 range = sdsnewlen((char*)o->ptr+start,rangelen);
4211 addReplySds(c,range);
4212 addReply(c,shared.crlf);
4213 decrRefCount(o);
4214 }
4215
4216 /* ========================= Type agnostic commands ========================= */
4217
4218 static void delCommand(redisClient *c) {
4219 int deleted = 0, j;
4220
4221 for (j = 1; j < c->argc; j++) {
4222 if (deleteKey(c->db,c->argv[j])) {
4223 server.dirty++;
4224 deleted++;
4225 }
4226 }
4227 addReplyLong(c,deleted);
4228 }
4229
4230 static void existsCommand(redisClient *c) {
4231 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4232 }
4233
4234 static void selectCommand(redisClient *c) {
4235 int id = atoi(c->argv[1]->ptr);
4236
4237 if (selectDb(c,id) == REDIS_ERR) {
4238 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4239 } else {
4240 addReply(c,shared.ok);
4241 }
4242 }
4243
4244 static void randomkeyCommand(redisClient *c) {
4245 dictEntry *de;
4246
4247 while(1) {
4248 de = dictGetRandomKey(c->db->dict);
4249 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4250 }
4251 if (de == NULL) {
4252 addReply(c,shared.plus);
4253 addReply(c,shared.crlf);
4254 } else {
4255 addReply(c,shared.plus);
4256 addReply(c,dictGetEntryKey(de));
4257 addReply(c,shared.crlf);
4258 }
4259 }
4260
4261 static void keysCommand(redisClient *c) {
4262 dictIterator *di;
4263 dictEntry *de;
4264 sds pattern = c->argv[1]->ptr;
4265 int plen = sdslen(pattern);
4266 unsigned long numkeys = 0;
4267 robj *lenobj = createObject(REDIS_STRING,NULL);
4268
4269 di = dictGetIterator(c->db->dict);
4270 addReply(c,lenobj);
4271 decrRefCount(lenobj);
4272 while((de = dictNext(di)) != NULL) {
4273 robj *keyobj = dictGetEntryKey(de);
4274
4275 sds key = keyobj->ptr;
4276 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4277 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4278 if (expireIfNeeded(c->db,keyobj) == 0) {
4279 addReplyBulk(c,keyobj);
4280 numkeys++;
4281 }
4282 }
4283 }
4284 dictReleaseIterator(di);
4285 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4286 }
4287
4288 static void dbsizeCommand(redisClient *c) {
4289 addReplySds(c,
4290 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4291 }
4292
4293 static void lastsaveCommand(redisClient *c) {
4294 addReplySds(c,
4295 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4296 }
4297
4298 static void typeCommand(redisClient *c) {
4299 robj *o;
4300 char *type;
4301
4302 o = lookupKeyRead(c->db,c->argv[1]);
4303 if (o == NULL) {
4304 type = "+none";
4305 } else {
4306 switch(o->type) {
4307 case REDIS_STRING: type = "+string"; break;
4308 case REDIS_LIST: type = "+list"; break;
4309 case REDIS_SET: type = "+set"; break;
4310 case REDIS_ZSET: type = "+zset"; break;
4311 case REDIS_HASH: type = "+hash"; break;
4312 default: type = "+unknown"; break;
4313 }
4314 }
4315 addReplySds(c,sdsnew(type));
4316 addReply(c,shared.crlf);
4317 }
4318
4319 static void saveCommand(redisClient *c) {
4320 if (server.bgsavechildpid != -1) {
4321 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4322 return;
4323 }
4324 if (rdbSave(server.dbfilename) == REDIS_OK) {
4325 addReply(c,shared.ok);
4326 } else {
4327 addReply(c,shared.err);
4328 }
4329 }
4330
4331 static void bgsaveCommand(redisClient *c) {
4332 if (server.bgsavechildpid != -1) {
4333 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4334 return;
4335 }
4336 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4337 char *status = "+Background saving started\r\n";
4338 addReplySds(c,sdsnew(status));
4339 } else {
4340 addReply(c,shared.err);
4341 }
4342 }
4343
4344 static void shutdownCommand(redisClient *c) {
4345 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4346 /* Kill the saving child if there is a background saving in progress.
4347 We want to avoid race conditions, for instance our saving child may
4348 overwrite the synchronous saving did by SHUTDOWN. */
4349 if (server.bgsavechildpid != -1) {
4350 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4351 kill(server.bgsavechildpid,SIGKILL);
4352 rdbRemoveTempFile(server.bgsavechildpid);
4353 }
4354 if (server.appendonly) {
4355 /* Append only file: fsync() the AOF and exit */
4356 fsync(server.appendfd);
4357 if (server.vm_enabled) unlink(server.vm_swap_file);
4358 exit(0);
4359 } else {
4360 /* Snapshotting. Perform a SYNC SAVE and exit */
4361 if (rdbSave(server.dbfilename) == REDIS_OK) {
4362 if (server.daemonize)
4363 unlink(server.pidfile);
4364 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4365 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4366 if (server.vm_enabled) unlink(server.vm_swap_file);
4367 exit(0);
4368 } else {
4369 /* Ooops.. error saving! The best we can do is to continue
4370 * operating. Note that if there was a background saving process,
4371 * in the next cron() Redis will be notified that the background
4372 * saving aborted, handling special stuff like slaves pending for
4373 * synchronization... */
4374 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4375 addReplySds(c,
4376 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4377 }
4378 }
4379 }
4380
4381 static void renameGenericCommand(redisClient *c, int nx) {
4382 robj *o;
4383
4384 /* To use the same key as src and dst is probably an error */
4385 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4386 addReply(c,shared.sameobjecterr);
4387 return;
4388 }
4389
4390 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4391 return;
4392
4393 incrRefCount(o);
4394 deleteIfVolatile(c->db,c->argv[2]);
4395 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4396 if (nx) {
4397 decrRefCount(o);
4398 addReply(c,shared.czero);
4399 return;
4400 }
4401 dictReplace(c->db->dict,c->argv[2],o);
4402 } else {
4403 incrRefCount(c->argv[2]);
4404 }
4405 deleteKey(c->db,c->argv[1]);
4406 server.dirty++;
4407 addReply(c,nx ? shared.cone : shared.ok);
4408 }
4409
4410 static void renameCommand(redisClient *c) {
4411 renameGenericCommand(c,0);
4412 }
4413
4414 static void renamenxCommand(redisClient *c) {
4415 renameGenericCommand(c,1);
4416 }
4417
4418 static void moveCommand(redisClient *c) {
4419 robj *o;
4420 redisDb *src, *dst;
4421 int srcid;
4422
4423 /* Obtain source and target DB pointers */
4424 src = c->db;
4425 srcid = c->db->id;
4426 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4427 addReply(c,shared.outofrangeerr);
4428 return;
4429 }
4430 dst = c->db;
4431 selectDb(c,srcid); /* Back to the source DB */
4432
4433 /* If the user is moving using as target the same
4434 * DB as the source DB it is probably an error. */
4435 if (src == dst) {
4436 addReply(c,shared.sameobjecterr);
4437 return;
4438 }
4439
4440 /* Check if the element exists and get a reference */
4441 o = lookupKeyWrite(c->db,c->argv[1]);
4442 if (!o) {
4443 addReply(c,shared.czero);
4444 return;
4445 }
4446
4447 /* Try to add the element to the target DB */
4448 deleteIfVolatile(dst,c->argv[1]);
4449 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4450 addReply(c,shared.czero);
4451 return;
4452 }
4453 incrRefCount(c->argv[1]);
4454 incrRefCount(o);
4455
4456 /* OK! key moved, free the entry in the source DB */
4457 deleteKey(src,c->argv[1]);
4458 server.dirty++;
4459 addReply(c,shared.cone);
4460 }
4461
4462 /* =================================== Lists ================================ */
4463 static void pushGenericCommand(redisClient *c, int where) {
4464 robj *lobj;
4465 list *list;
4466
4467 lobj = lookupKeyWrite(c->db,c->argv[1]);
4468 if (lobj == NULL) {
4469 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4470 addReply(c,shared.cone);
4471 return;
4472 }
4473 lobj = createListObject();
4474 list = lobj->ptr;
4475 if (where == REDIS_HEAD) {
4476 listAddNodeHead(list,c->argv[2]);
4477 } else {
4478 listAddNodeTail(list,c->argv[2]);
4479 }
4480 dictAdd(c->db->dict,c->argv[1],lobj);
4481 incrRefCount(c->argv[1]);
4482 incrRefCount(c->argv[2]);
4483 } else {
4484 if (lobj->type != REDIS_LIST) {
4485 addReply(c,shared.wrongtypeerr);
4486 return;
4487 }
4488 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4489 addReply(c,shared.cone);
4490 return;
4491 }
4492 list = lobj->ptr;
4493 if (where == REDIS_HEAD) {
4494 listAddNodeHead(list,c->argv[2]);
4495 } else {
4496 listAddNodeTail(list,c->argv[2]);
4497 }
4498 incrRefCount(c->argv[2]);
4499 }
4500 server.dirty++;
4501 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4502 }
4503
4504 static void lpushCommand(redisClient *c) {
4505 pushGenericCommand(c,REDIS_HEAD);
4506 }
4507
4508 static void rpushCommand(redisClient *c) {
4509 pushGenericCommand(c,REDIS_TAIL);
4510 }
4511
4512 static void llenCommand(redisClient *c) {
4513 robj *o;
4514 list *l;
4515
4516 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4517 checkType(c,o,REDIS_LIST)) return;
4518
4519 l = o->ptr;
4520 addReplyUlong(c,listLength(l));
4521 }
4522
4523 static void lindexCommand(redisClient *c) {
4524 robj *o;
4525 int index = atoi(c->argv[2]->ptr);
4526 list *list;
4527 listNode *ln;
4528
4529 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4530 checkType(c,o,REDIS_LIST)) return;
4531 list = o->ptr;
4532
4533 ln = listIndex(list, index);
4534 if (ln == NULL) {
4535 addReply(c,shared.nullbulk);
4536 } else {
4537 robj *ele = listNodeValue(ln);
4538 addReplyBulk(c,ele);
4539 }
4540 }
4541
4542 static void lsetCommand(redisClient *c) {
4543 robj *o;
4544 int index = atoi(c->argv[2]->ptr);
4545 list *list;
4546 listNode *ln;
4547
4548 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4549 checkType(c,o,REDIS_LIST)) return;
4550 list = o->ptr;
4551
4552 ln = listIndex(list, index);
4553 if (ln == NULL) {
4554 addReply(c,shared.outofrangeerr);
4555 } else {
4556 robj *ele = listNodeValue(ln);
4557
4558 decrRefCount(ele);
4559 listNodeValue(ln) = c->argv[3];
4560 incrRefCount(c->argv[3]);
4561 addReply(c,shared.ok);
4562 server.dirty++;
4563 }
4564 }
4565
4566 static void popGenericCommand(redisClient *c, int where) {
4567 robj *o;
4568 list *list;
4569 listNode *ln;
4570
4571 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4572 checkType(c,o,REDIS_LIST)) return;
4573 list = o->ptr;
4574
4575 if (where == REDIS_HEAD)
4576 ln = listFirst(list);
4577 else
4578 ln = listLast(list);
4579
4580 if (ln == NULL) {
4581 addReply(c,shared.nullbulk);
4582 } else {
4583 robj *ele = listNodeValue(ln);
4584 addReplyBulk(c,ele);
4585 listDelNode(list,ln);
4586 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4587 server.dirty++;
4588 }
4589 }
4590
4591 static void lpopCommand(redisClient *c) {
4592 popGenericCommand(c,REDIS_HEAD);
4593 }
4594
4595 static void rpopCommand(redisClient *c) {
4596 popGenericCommand(c,REDIS_TAIL);
4597 }
4598
4599 static void lrangeCommand(redisClient *c) {
4600 robj *o;
4601 int start = atoi(c->argv[2]->ptr);
4602 int end = atoi(c->argv[3]->ptr);
4603 int llen;
4604 int rangelen, j;
4605 list *list;
4606 listNode *ln;
4607 robj *ele;
4608
4609 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4610 || checkType(c,o,REDIS_LIST)) return;
4611 list = o->ptr;
4612 llen = listLength(list);
4613
4614 /* convert negative indexes */
4615 if (start < 0) start = llen+start;
4616 if (end < 0) end = llen+end;
4617 if (start < 0) start = 0;
4618 if (end < 0) end = 0;
4619
4620 /* indexes sanity checks */
4621 if (start > end || start >= llen) {
4622 /* Out of range start or start > end result in empty list */
4623 addReply(c,shared.emptymultibulk);
4624 return;
4625 }
4626 if (end >= llen) end = llen-1;
4627 rangelen = (end-start)+1;
4628
4629 /* Return the result in form of a multi-bulk reply */
4630 ln = listIndex(list, start);
4631 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4632 for (j = 0; j < rangelen; j++) {
4633 ele = listNodeValue(ln);
4634 addReplyBulk(c,ele);
4635 ln = ln->next;
4636 }
4637 }
4638
4639 static void ltrimCommand(redisClient *c) {
4640 robj *o;
4641 int start = atoi(c->argv[2]->ptr);
4642 int end = atoi(c->argv[3]->ptr);
4643 int llen;
4644 int j, ltrim, rtrim;
4645 list *list;
4646 listNode *ln;
4647
4648 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4649 checkType(c,o,REDIS_LIST)) return;
4650 list = o->ptr;
4651 llen = listLength(list);
4652
4653 /* convert negative indexes */
4654 if (start < 0) start = llen+start;
4655 if (end < 0) end = llen+end;
4656 if (start < 0) start = 0;
4657 if (end < 0) end = 0;
4658
4659 /* indexes sanity checks */
4660 if (start > end || start >= llen) {
4661 /* Out of range start or start > end result in empty list */
4662 ltrim = llen;
4663 rtrim = 0;
4664 } else {
4665 if (end >= llen) end = llen-1;
4666 ltrim = start;
4667 rtrim = llen-end-1;
4668 }
4669
4670 /* Remove list elements to perform the trim */
4671 for (j = 0; j < ltrim; j++) {
4672 ln = listFirst(list);
4673 listDelNode(list,ln);
4674 }
4675 for (j = 0; j < rtrim; j++) {
4676 ln = listLast(list);
4677 listDelNode(list,ln);
4678 }
4679 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4680 server.dirty++;
4681 addReply(c,shared.ok);
4682 }
4683
4684 static void lremCommand(redisClient *c) {
4685 robj *o;
4686 list *list;
4687 listNode *ln, *next;
4688 int toremove = atoi(c->argv[2]->ptr);
4689 int removed = 0;
4690 int fromtail = 0;
4691
4692 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4693 checkType(c,o,REDIS_LIST)) return;
4694 list = o->ptr;
4695
4696 if (toremove < 0) {
4697 toremove = -toremove;
4698 fromtail = 1;
4699 }
4700 ln = fromtail ? list->tail : list->head;
4701 while (ln) {
4702 robj *ele = listNodeValue(ln);
4703
4704 next = fromtail ? ln->prev : ln->next;
4705 if (compareStringObjects(ele,c->argv[3]) == 0) {
4706 listDelNode(list,ln);
4707 server.dirty++;
4708 removed++;
4709 if (toremove && removed == toremove) break;
4710 }
4711 ln = next;
4712 }
4713 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4714 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4715 }
4716
4717 /* This is the semantic of this command:
4718 * RPOPLPUSH srclist dstlist:
4719 * IF LLEN(srclist) > 0
4720 * element = RPOP srclist
4721 * LPUSH dstlist element
4722 * RETURN element
4723 * ELSE
4724 * RETURN nil
4725 * END
4726 * END
4727 *
4728 * The idea is to be able to get an element from a list in a reliable way
4729 * since the element is not just returned but pushed against another list
4730 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4731 */
4732 static void rpoplpushcommand(redisClient *c) {
4733 robj *sobj;
4734 list *srclist;
4735 listNode *ln;
4736
4737 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4738 checkType(c,sobj,REDIS_LIST)) return;
4739 srclist = sobj->ptr;
4740 ln = listLast(srclist);
4741
4742 if (ln == NULL) {
4743 addReply(c,shared.nullbulk);
4744 } else {
4745 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4746 robj *ele = listNodeValue(ln);
4747 list *dstlist;
4748
4749 if (dobj && dobj->type != REDIS_LIST) {
4750 addReply(c,shared.wrongtypeerr);
4751 return;
4752 }
4753
4754 /* Add the element to the target list (unless it's directly
4755 * passed to some BLPOP-ing client */
4756 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4757 if (dobj == NULL) {
4758 /* Create the list if the key does not exist */
4759 dobj = createListObject();
4760 dictAdd(c->db->dict,c->argv[2],dobj);
4761 incrRefCount(c->argv[2]);
4762 }
4763 dstlist = dobj->ptr;
4764 listAddNodeHead(dstlist,ele);
4765 incrRefCount(ele);
4766 }
4767
4768 /* Send the element to the client as reply as well */
4769 addReplyBulk(c,ele);
4770
4771 /* Finally remove the element from the source list */
4772 listDelNode(srclist,ln);
4773 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4774 server.dirty++;
4775 }
4776 }
4777
4778 /* ==================================== Sets ================================ */
4779
4780 static void saddCommand(redisClient *c) {
4781 robj *set;
4782
4783 set = lookupKeyWrite(c->db,c->argv[1]);
4784 if (set == NULL) {
4785 set = createSetObject();
4786 dictAdd(c->db->dict,c->argv[1],set);
4787 incrRefCount(c->argv[1]);
4788 } else {
4789 if (set->type != REDIS_SET) {
4790 addReply(c,shared.wrongtypeerr);
4791 return;
4792 }
4793 }
4794 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4795 incrRefCount(c->argv[2]);
4796 server.dirty++;
4797 addReply(c,shared.cone);
4798 } else {
4799 addReply(c,shared.czero);
4800 }
4801 }
4802
4803 static void sremCommand(redisClient *c) {
4804 robj *set;
4805
4806 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4807 checkType(c,set,REDIS_SET)) return;
4808
4809 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4810 server.dirty++;
4811 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4812 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4813 addReply(c,shared.cone);
4814 } else {
4815 addReply(c,shared.czero);
4816 }
4817 }
4818
4819 static void smoveCommand(redisClient *c) {
4820 robj *srcset, *dstset;
4821
4822 srcset = lookupKeyWrite(c->db,c->argv[1]);
4823 dstset = lookupKeyWrite(c->db,c->argv[2]);
4824
4825 /* If the source key does not exist return 0, if it's of the wrong type
4826 * raise an error */
4827 if (srcset == NULL || srcset->type != REDIS_SET) {
4828 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4829 return;
4830 }
4831 /* Error if the destination key is not a set as well */
4832 if (dstset && dstset->type != REDIS_SET) {
4833 addReply(c,shared.wrongtypeerr);
4834 return;
4835 }
4836 /* Remove the element from the source set */
4837 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4838 /* Key not found in the src set! return zero */
4839 addReply(c,shared.czero);
4840 return;
4841 }
4842 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4843 deleteKey(c->db,c->argv[1]);
4844 server.dirty++;
4845 /* Add the element to the destination set */
4846 if (!dstset) {
4847 dstset = createSetObject();
4848 dictAdd(c->db->dict,c->argv[2],dstset);
4849 incrRefCount(c->argv[2]);
4850 }
4851 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4852 incrRefCount(c->argv[3]);
4853 addReply(c,shared.cone);
4854 }
4855
4856 static void sismemberCommand(redisClient *c) {
4857 robj *set;
4858
4859 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4860 checkType(c,set,REDIS_SET)) return;
4861
4862 if (dictFind(set->ptr,c->argv[2]))
4863 addReply(c,shared.cone);
4864 else
4865 addReply(c,shared.czero);
4866 }
4867
4868 static void scardCommand(redisClient *c) {
4869 robj *o;
4870 dict *s;
4871
4872 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4873 checkType(c,o,REDIS_SET)) return;
4874
4875 s = o->ptr;
4876 addReplyUlong(c,dictSize(s));
4877 }
4878
4879 static void spopCommand(redisClient *c) {
4880 robj *set;
4881 dictEntry *de;
4882
4883 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4884 checkType(c,set,REDIS_SET)) return;
4885
4886 de = dictGetRandomKey(set->ptr);
4887 if (de == NULL) {
4888 addReply(c,shared.nullbulk);
4889 } else {
4890 robj *ele = dictGetEntryKey(de);
4891
4892 addReplyBulk(c,ele);
4893 dictDelete(set->ptr,ele);
4894 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4895 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4896 server.dirty++;
4897 }
4898 }
4899
4900 static void srandmemberCommand(redisClient *c) {
4901 robj *set;
4902 dictEntry *de;
4903
4904 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4905 checkType(c,set,REDIS_SET)) return;
4906
4907 de = dictGetRandomKey(set->ptr);
4908 if (de == NULL) {
4909 addReply(c,shared.nullbulk);
4910 } else {
4911 robj *ele = dictGetEntryKey(de);
4912
4913 addReplyBulk(c,ele);
4914 }
4915 }
4916
4917 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4918 dict **d1 = (void*) s1, **d2 = (void*) s2;
4919
4920 return dictSize(*d1)-dictSize(*d2);
4921 }
4922
4923 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4924 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4925 dictIterator *di;
4926 dictEntry *de;
4927 robj *lenobj = NULL, *dstset = NULL;
4928 unsigned long j, cardinality = 0;
4929
4930 for (j = 0; j < setsnum; j++) {
4931 robj *setobj;
4932
4933 setobj = dstkey ?
4934 lookupKeyWrite(c->db,setskeys[j]) :
4935 lookupKeyRead(c->db,setskeys[j]);
4936 if (!setobj) {
4937 zfree(dv);
4938 if (dstkey) {
4939 if (deleteKey(c->db,dstkey))
4940 server.dirty++;
4941 addReply(c,shared.czero);
4942 } else {
4943 addReply(c,shared.emptymultibulk);
4944 }
4945 return;
4946 }
4947 if (setobj->type != REDIS_SET) {
4948 zfree(dv);
4949 addReply(c,shared.wrongtypeerr);
4950 return;
4951 }
4952 dv[j] = setobj->ptr;
4953 }
4954 /* Sort sets from the smallest to largest, this will improve our
4955 * algorithm's performace */
4956 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4957
4958 /* The first thing we should output is the total number of elements...
4959 * since this is a multi-bulk write, but at this stage we don't know
4960 * the intersection set size, so we use a trick, append an empty object
4961 * to the output list and save the pointer to later modify it with the
4962 * right length */
4963 if (!dstkey) {
4964 lenobj = createObject(REDIS_STRING,NULL);
4965 addReply(c,lenobj);
4966 decrRefCount(lenobj);
4967 } else {
4968 /* If we have a target key where to store the resulting set
4969 * create this key with an empty set inside */
4970 dstset = createSetObject();
4971 }
4972
4973 /* Iterate all the elements of the first (smallest) set, and test
4974 * the element against all the other sets, if at least one set does
4975 * not include the element it is discarded */
4976 di = dictGetIterator(dv[0]);
4977
4978 while((de = dictNext(di)) != NULL) {
4979 robj *ele;
4980
4981 for (j = 1; j < setsnum; j++)
4982 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4983 if (j != setsnum)
4984 continue; /* at least one set does not contain the member */
4985 ele = dictGetEntryKey(de);
4986 if (!dstkey) {
4987 addReplyBulk(c,ele);
4988 cardinality++;
4989 } else {
4990 dictAdd(dstset->ptr,ele,NULL);
4991 incrRefCount(ele);
4992 }
4993 }
4994 dictReleaseIterator(di);
4995
4996 if (dstkey) {
4997 /* Store the resulting set into the target, if the intersection
4998 * is not an empty set. */
4999 deleteKey(c->db,dstkey);
5000 if (dictSize((dict*)dstset->ptr) > 0) {
5001 dictAdd(c->db->dict,dstkey,dstset);
5002 incrRefCount(dstkey);
5003 addReplyLong(c,dictSize((dict*)dstset->ptr));
5004 } else {
5005 decrRefCount(dstset);
5006 addReply(c,shared.czero);
5007 }
5008 server.dirty++;
5009 } else {
5010 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
5011 }
5012 zfree(dv);
5013 }
5014
5015 static void sinterCommand(redisClient *c) {
5016 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5017 }
5018
5019 static void sinterstoreCommand(redisClient *c) {
5020 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5021 }
5022
/* Operation selectors passed as the 'op' argument of the generic set
 * operation commands. */
#define REDIS_OP_UNION  0   /* add the members of every set */
#define REDIS_OP_DIFF   1   /* first set minus all the following ones */
#define REDIS_OP_INTER  2   /* NOTE(review): presumably intersection; its
                             * users are not in this part of the file */
5026
5027 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
5028 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5029 dictIterator *di;
5030 dictEntry *de;
5031 robj *dstset = NULL;
5032 int j, cardinality = 0;
5033
5034 for (j = 0; j < setsnum; j++) {
5035 robj *setobj;
5036
5037 setobj = dstkey ?
5038 lookupKeyWrite(c->db,setskeys[j]) :
5039 lookupKeyRead(c->db,setskeys[j]);
5040 if (!setobj) {
5041 dv[j] = NULL;
5042 continue;
5043 }
5044 if (setobj->type != REDIS_SET) {
5045 zfree(dv);
5046 addReply(c,shared.wrongtypeerr);
5047 return;
5048 }
5049 dv[j] = setobj->ptr;
5050 }
5051
5052 /* We need a temp set object to store our union. If the dstkey
5053 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5054 * this set object will be the resulting object to set into the target key*/
5055 dstset = createSetObject();
5056
5057 /* Iterate all the elements of all the sets, add every element a single
5058 * time to the result set */
5059 for (j = 0; j < setsnum; j++) {
5060 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
5061 if (!dv[j]) continue; /* non existing keys are like empty sets */
5062
5063 di = dictGetIterator(dv[j]);
5064
5065 while((de = dictNext(di)) != NULL) {
5066 robj *ele;
5067
5068 /* dictAdd will not add the same element multiple times */
5069 ele = dictGetEntryKey(de);
5070 if (op == REDIS_OP_UNION || j == 0) {
5071 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5072 incrRefCount(ele);
5073 cardinality++;
5074 }
5075 } else if (op == REDIS_OP_DIFF) {
5076 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5077 cardinality--;
5078 }
5079 }
5080 }
5081 dictReleaseIterator(di);
5082
5083 /* result set is empty? Exit asap. */
5084 if (op == REDIS_OP_DIFF && cardinality == 0) break;
5085 }
5086
5087 /* Output the content of the resulting set, if not in STORE mode */
5088 if (!dstkey) {
5089 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5090 di = dictGetIterator(dstset->ptr);
5091 while((de = dictNext(di)) != NULL) {
5092 robj *ele;
5093
5094 ele = dictGetEntryKey(de);
5095 addReplyBulk(c,ele);
5096 }
5097 dictReleaseIterator(di);
5098 decrRefCount(dstset);
5099 } else {
5100 /* If we have a target key where to store the resulting set
5101 * create this key with the result set inside */
5102 deleteKey(c->db,dstkey);
5103 if (dictSize((dict*)dstset->ptr) > 0) {
5104 dictAdd(c->db->dict,dstkey,dstset);
5105 incrRefCount(dstkey);
5106 addReplyLong(c,dictSize((dict*)dstset->ptr));
5107 } else {
5108 decrRefCount(dstset);
5109 addReply(c,shared.czero);
5110 }
5111 server.dirty++;
5112 }
5113 zfree(dv);
5114 }
5115
5116 static void sunionCommand(redisClient *c) {
5117 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
5118 }
5119
5120 static void sunionstoreCommand(redisClient *c) {
5121 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5122 }
5123
5124 static void sdiffCommand(redisClient *c) {
5125 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5126 }
5127
5128 static void sdiffstoreCommand(redisClient *c) {
5129 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
5130 }
5131
5132 /* ==================================== ZSets =============================== */
5133
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5141
5142 /* This skiplist implementation is almost a C translation of the original
5143 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5144 * Alternative to Balanced Trees", modified in three ways:
5145 * a) this implementation allows for repeated values.
5146 * b) the comparison is not just by key (our 'score') but by satellite data.
5147 * c) there is a back pointer, so it's a doubly linked list with the back
5148 * pointers being only at "level 1". This allows to traverse the list
5149 * from tail to head, useful for ZREVRANGE. */
5150
5151 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5152 zskiplistNode *zn = zmalloc(sizeof(*zn));
5153
5154 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
5155 if (level > 0)
5156 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
5157 zn->score = score;
5158 zn->obj = obj;
5159 return zn;
5160 }
5161
5162 static zskiplist *zslCreate(void) {
5163 int j;
5164 zskiplist *zsl;
5165
5166 zsl = zmalloc(sizeof(*zsl));
5167 zsl->level = 1;
5168 zsl->length = 0;
5169 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5170 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5171 zsl->header->forward[j] = NULL;
5172
5173 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5174 if (j < ZSKIPLIST_MAXLEVEL-1)
5175 zsl->header->span[j] = 0;
5176 }
5177 zsl->header->backward = NULL;
5178 zsl->tail = NULL;
5179 return zsl;
5180 }
5181
5182 static void zslFreeNode(zskiplistNode *node) {
5183 decrRefCount(node->obj);
5184 zfree(node->forward);
5185 zfree(node->span);
5186 zfree(node);
5187 }
5188
5189 static void zslFree(zskiplist *zsl) {
5190 zskiplistNode *node = zsl->header->forward[0], *next;
5191
5192 zfree(zsl->header->forward);
5193 zfree(zsl->header->span);
5194 zfree(zsl->header);
5195 while(node) {
5196 next = node->forward[0];
5197 zslFreeNode(node);
5198 node = next;
5199 }
5200 zfree(zsl);
5201 }
5202
5203 static int zslRandomLevel(void) {
5204 int level = 1;
5205 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5206 level += 1;
5207 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5208 }
5209
5210 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5211 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5212 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5213 int i, level;
5214
5215 x = zsl->header;
5216 for (i = zsl->level-1; i >= 0; i--) {
5217 /* store rank that is crossed to reach the insert position */
5218 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5219
5220 while (x->forward[i] &&
5221 (x->forward[i]->score < score ||
5222 (x->forward[i]->score == score &&
5223 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5224 rank[i] += i > 0 ? x->span[i-1] : 1;
5225 x = x->forward[i];
5226 }
5227 update[i] = x;
5228 }
5229 /* we assume the key is not already inside, since we allow duplicated
5230 * scores, and the re-insertion of score and redis object should never
5231 * happpen since the caller of zslInsert() should test in the hash table
5232 * if the element is already inside or not. */
5233 level = zslRandomLevel();
5234 if (level > zsl->level) {
5235 for (i = zsl->level; i < level; i++) {
5236 rank[i] = 0;
5237 update[i] = zsl->header;
5238 update[i]->span[i-1] = zsl->length;
5239 }
5240 zsl->level = level;
5241 }
5242 x = zslCreateNode(level,score,obj);
5243 for (i = 0; i < level; i++) {
5244 x->forward[i] = update[i]->forward[i];
5245 update[i]->forward[i] = x;
5246
5247 /* update span covered by update[i] as x is inserted here */
5248 if (i > 0) {
5249 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5250 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5251 }
5252 }
5253
5254 /* increment span for untouched levels */
5255 for (i = level; i < zsl->level; i++) {
5256 update[i]->span[i-1]++;
5257 }
5258
5259 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5260 if (x->forward[0])
5261 x->forward[0]->backward = x;
5262 else
5263 zsl->tail = x;
5264 zsl->length++;
5265 }
5266
5267 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5268 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5269 int i;
5270 for (i = 0; i < zsl->level; i++) {
5271 if (update[i]->forward[i] == x) {
5272 if (i > 0) {
5273 update[i]->span[i-1] += x->span[i-1] - 1;
5274 }
5275 update[i]->forward[i] = x->forward[i];
5276 } else {
5277 /* invariant: i > 0, because update[0]->forward[0]
5278 * is always equal to x */
5279 update[i]->span[i-1] -= 1;
5280 }
5281 }
5282 if (x->forward[0]) {
5283 x->forward[0]->backward = x->backward;
5284 } else {
5285 zsl->tail = x->backward;
5286 }
5287 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5288 zsl->level--;
5289 zsl->length--;
5290 }
5291
5292 /* Delete an element with matching score/object from the skiplist. */
5293 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5294 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5295 int i;
5296
5297 x = zsl->header;
5298 for (i = zsl->level-1; i >= 0; i--) {
5299 while (x->forward[i] &&
5300 (x->forward[i]->score < score ||
5301 (x->forward[i]->score == score &&
5302 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5303 x = x->forward[i];
5304 update[i] = x;
5305 }
5306 /* We may have multiple elements with the same score, what we need
5307 * is to find the element with both the right score and object. */
5308 x = x->forward[0];
5309 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5310 zslDeleteNode(zsl, x, update);
5311 zslFreeNode(x);
5312 return 1;
5313 } else {
5314 return 0; /* not found */
5315 }
5316 return 0; /* not found */
5317 }
5318
5319 /* Delete all the elements with score between min and max from the skiplist.
5320 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5321 * Note that this function takes the reference to the hash table view of the
5322 * sorted set, in order to remove the elements from the hash table too. */
5323 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5324 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5325 unsigned long removed = 0;
5326 int i;
5327
5328 x = zsl->header;
5329 for (i = zsl->level-1; i >= 0; i--) {
5330 while (x->forward[i] && x->forward[i]->score < min)
5331 x = x->forward[i];
5332 update[i] = x;
5333 }
5334 /* We may have multiple elements with the same score, what we need
5335 * is to find the element with both the right score and object. */
5336 x = x->forward[0];
5337 while (x && x->score <= max) {
5338 zskiplistNode *next = x->forward[0];
5339 zslDeleteNode(zsl, x, update);
5340 dictDelete(dict,x->obj);
5341 zslFreeNode(x);
5342 removed++;
5343 x = next;
5344 }
5345 return removed; /* not found */
5346 }
5347
5348 /* Delete all the elements with rank between start and end from the skiplist.
5349 * Start and end are inclusive. Note that start and end need to be 1-based */
5350 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5351 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5352 unsigned long traversed = 0, removed = 0;
5353 int i;
5354
5355 x = zsl->header;
5356 for (i = zsl->level-1; i >= 0; i--) {
5357 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5358 traversed += i > 0 ? x->span[i-1] : 1;
5359 x = x->forward[i];
5360 }
5361 update[i] = x;
5362 }
5363
5364 traversed++;
5365 x = x->forward[0];
5366 while (x && traversed <= end) {
5367 zskiplistNode *next = x->forward[0];
5368 zslDeleteNode(zsl, x, update);
5369 dictDelete(dict,x->obj);
5370 zslFreeNode(x);
5371 removed++;
5372 traversed++;
5373 x = next;
5374 }
5375 return removed;
5376 }
5377
5378 /* Find the first node having a score equal or greater than the specified one.
5379 * Returns NULL if there is no match. */
5380 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5381 zskiplistNode *x;
5382 int i;
5383
5384 x = zsl->header;
5385 for (i = zsl->level-1; i >= 0; i--) {
5386 while (x->forward[i] && x->forward[i]->score < score)
5387 x = x->forward[i];
5388 }
5389 /* We may have multiple elements with the same score, what we need
5390 * is to find the element with both the right score and object. */
5391 return x->forward[0];
5392 }
5393
5394 /* Find the rank for an element by both score and key.
5395 * Returns 0 when the element cannot be found, rank otherwise.
5396 * Note that the rank is 1-based due to the span of zsl->header to the
5397 * first element. */
5398 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5399 zskiplistNode *x;
5400 unsigned long rank = 0;
5401 int i;
5402
5403 x = zsl->header;
5404 for (i = zsl->level-1; i >= 0; i--) {
5405 while (x->forward[i] &&
5406 (x->forward[i]->score < score ||
5407 (x->forward[i]->score == score &&
5408 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5409 rank += i > 0 ? x->span[i-1] : 1;
5410 x = x->forward[i];
5411 }
5412
5413 /* x might be equal to zsl->header, so test if obj is non-NULL */
5414 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5415 return rank;
5416 }
5417 }
5418 return 0;
5419 }
5420
5421 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5422 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5423 zskiplistNode *x;
5424 unsigned long traversed = 0;
5425 int i;
5426
5427 x = zsl->header;
5428 for (i = zsl->level-1; i >= 0; i--) {
5429 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5430 {
5431 traversed += i > 0 ? x->span[i-1] : 1;
5432 x = x->forward[i];
5433 }
5434 if (traversed == rank) {
5435 return x;
5436 }
5437 }
5438 return NULL;
5439 }
5440
5441 /* The actual Z-commands implementations */
5442
5443 /* This generic command implements both ZADD and ZINCRBY.
5444 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5445 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5446 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5447 robj *zsetobj;
5448 zset *zs;
5449 double *score;
5450
5451 zsetobj = lookupKeyWrite(c->db,key);
5452 if (zsetobj == NULL) {
5453 zsetobj = createZsetObject();
5454 dictAdd(c->db->dict,key,zsetobj);
5455 incrRefCount(key);
5456 } else {
5457 if (zsetobj->type != REDIS_ZSET) {
5458 addReply(c,shared.wrongtypeerr);
5459 return;
5460 }
5461 }
5462 zs = zsetobj->ptr;
5463
5464 /* Ok now since we implement both ZADD and ZINCRBY here the code
5465 * needs to handle the two different conditions. It's all about setting
5466 * '*score', that is, the new score to set, to the right value. */
5467 score = zmalloc(sizeof(double));
5468 if (doincrement) {
5469 dictEntry *de;
5470
5471 /* Read the old score. If the element was not present starts from 0 */
5472 de = dictFind(zs->dict,ele);
5473 if (de) {
5474 double *oldscore = dictGetEntryVal(de);
5475 *score = *oldscore + scoreval;
5476 } else {
5477 *score = scoreval;
5478 }
5479 } else {
5480 *score = scoreval;
5481 }
5482
5483 /* What follows is a simple remove and re-insert operation that is common
5484 * to both ZADD and ZINCRBY... */
5485 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5486 /* case 1: New element */
5487 incrRefCount(ele); /* added to hash */
5488 zslInsert(zs->zsl,*score,ele);
5489 incrRefCount(ele); /* added to skiplist */
5490 server.dirty++;
5491 if (doincrement)
5492 addReplyDouble(c,*score);
5493 else
5494 addReply(c,shared.cone);
5495 } else {
5496 dictEntry *de;
5497 double *oldscore;
5498
5499 /* case 2: Score update operation */
5500 de = dictFind(zs->dict,ele);
5501 redisAssert(de != NULL);
5502 oldscore = dictGetEntryVal(de);
5503 if (*score != *oldscore) {
5504 int deleted;
5505
5506 /* Remove and insert the element in the skip list with new score */
5507 deleted = zslDelete(zs->zsl,*oldscore,ele);
5508 redisAssert(deleted != 0);
5509 zslInsert(zs->zsl,*score,ele);
5510 incrRefCount(ele);
5511 /* Update the score in the hash table */
5512 dictReplace(zs->dict,ele,score);
5513 server.dirty++;
5514 } else {
5515 zfree(score);
5516 }
5517 if (doincrement)
5518 addReplyDouble(c,*score);
5519 else
5520 addReply(c,shared.czero);
5521 }
5522 }
5523
5524 static void zaddCommand(redisClient *c) {
5525 double scoreval;
5526
5527 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5528 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5529 }
5530
5531 static void zincrbyCommand(redisClient *c) {
5532 double scoreval;
5533
5534 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
5535 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5536 }
5537
5538 static void zremCommand(redisClient *c) {
5539 robj *zsetobj;
5540 zset *zs;
5541 dictEntry *de;
5542 double *oldscore;
5543 int deleted;
5544
5545 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5546 checkType(c,zsetobj,REDIS_ZSET)) return;
5547
5548 zs = zsetobj->ptr;
5549 de = dictFind(zs->dict,c->argv[2]);
5550 if (de == NULL) {
5551 addReply(c,shared.czero);
5552 return;
5553 }
5554 /* Delete from the skiplist */
5555 oldscore = dictGetEntryVal(de);
5556 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5557 redisAssert(deleted != 0);
5558
5559 /* Delete from the hash table */
5560 dictDelete(zs->dict,c->argv[2]);
5561 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5562 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5563 server.dirty++;
5564 addReply(c,shared.cone);
5565 }
5566
5567 static void zremrangebyscoreCommand(redisClient *c) {
5568 double min;
5569 double max;
5570 long deleted;
5571 robj *zsetobj;
5572 zset *zs;
5573
5574 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5575 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
5576
5577 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5578 checkType(c,zsetobj,REDIS_ZSET)) return;
5579
5580 zs = zsetobj->ptr;
5581 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5582 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5583 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5584 server.dirty += deleted;
5585 addReplyLong(c,deleted);
5586 }
5587
5588 static void zremrangebyrankCommand(redisClient *c) {
5589 long start;
5590 long end;
5591 int llen;
5592 long deleted;
5593 robj *zsetobj;
5594 zset *zs;
5595
5596 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5597 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5598
5599 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5600 checkType(c,zsetobj,REDIS_ZSET)) return;
5601 zs = zsetobj->ptr;
5602 llen = zs->zsl->length;
5603
5604 /* convert negative indexes */
5605 if (start < 0) start = llen+start;
5606 if (end < 0) end = llen+end;
5607 if (start < 0) start = 0;
5608 if (end < 0) end = 0;
5609
5610 /* indexes sanity checks */
5611 if (start > end || start >= llen) {
5612 addReply(c,shared.czero);
5613 return;
5614 }
5615 if (end >= llen) end = llen-1;
5616
5617 /* increment start and end because zsl*Rank functions
5618 * use 1-based rank */
5619 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5620 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5621 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5622 server.dirty += deleted;
5623 addReplyLong(c, deleted);
5624 }
5625
5626 typedef struct {
5627 dict *dict;
5628 double weight;
5629 } zsetopsrc;
5630
5631 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5632 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5633 unsigned long size1, size2;
5634 size1 = d1->dict ? dictSize(d1->dict) : 0;
5635 size2 = d2->dict ? dictSize(d2->dict) : 0;
5636 return size1 - size2;
5637 }
5638
/* How the scores of an element found in several inputs are combined. */
#define REDIS_AGGR_SUM  1   /* add the scores together */
#define REDIS_AGGR_MIN  2   /* keep the smallest score */
#define REDIS_AGGR_MAX  3   /* keep the largest score */
5642
5643 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5644 if (aggregate == REDIS_AGGR_SUM) {
5645 *target = *target + val;
5646 } else if (aggregate == REDIS_AGGR_MIN) {
5647 *target = val < *target ? val : *target;
5648 } else if (aggregate == REDIS_AGGR_MAX) {
5649 *target = val > *target ? val : *target;
5650 } else {
5651 /* safety net */
5652 redisPanic("Unknown ZUNION/INTER aggregate type");
5653 }
5654 }
5655
5656 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5657 int i, j, zsetnum;
5658 int aggregate = REDIS_AGGR_SUM;
5659 zsetopsrc *src;
5660 robj *dstobj;
5661 zset *dstzset;
5662 dictIterator *di;
5663 dictEntry *de;
5664
5665 /* expect zsetnum input keys to be given */
5666 zsetnum = atoi(c->argv[2]->ptr);
5667 if (zsetnum < 1) {
5668 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5669 return;
5670 }
5671
5672 /* test if the expected number of keys would overflow */
5673 if (3+zsetnum > c->argc) {
5674 addReply(c,shared.syntaxerr);
5675 return;
5676 }
5677
5678 /* read keys to be used for input */
5679 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5680 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5681 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5682 if (!zsetobj) {
5683 src[i].dict = NULL;
5684 } else {
5685 if (zsetobj->type != REDIS_ZSET) {
5686 zfree(src);
5687 addReply(c,shared.wrongtypeerr);
5688 return;
5689 }
5690 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5691 }
5692
5693 /* default all weights to 1 */
5694 src[i].weight = 1.0;
5695 }
5696
5697 /* parse optional extra arguments */
5698 if (j < c->argc) {
5699 int remaining = c->argc - j;
5700
5701 while (remaining) {
5702 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5703 j++; remaining--;
5704 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5705 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
5706 return;
5707 }
5708 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5709 j++; remaining--;
5710 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5711 aggregate = REDIS_AGGR_SUM;
5712 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5713 aggregate = REDIS_AGGR_MIN;
5714 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5715 aggregate = REDIS_AGGR_MAX;
5716 } else {
5717 zfree(src);
5718 addReply(c,shared.syntaxerr);
5719 return;
5720 }
5721 j++; remaining--;
5722 } else {
5723 zfree(src);
5724 addReply(c,shared.syntaxerr);
5725 return;
5726 }
5727 }
5728 }
5729
5730 /* sort sets from the smallest to largest, this will improve our
5731 * algorithm's performance */
5732 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5733
5734 dstobj = createZsetObject();
5735 dstzset = dstobj->ptr;
5736
5737 if (op == REDIS_OP_INTER) {
5738 /* skip going over all entries if the smallest zset is NULL or empty */
5739 if (src[0].dict && dictSize(src[0].dict) > 0) {
5740 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5741 * from small to large, all src[i > 0].dict are non-empty too */
5742 di = dictGetIterator(src[0].dict);
5743 while((de = dictNext(di)) != NULL) {
5744 double *score = zmalloc(sizeof(double)), value;
5745 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5746
5747 for (j = 1; j < zsetnum; j++) {
5748 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5749 if (other) {
5750 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5751 zunionInterAggregate(score, value, aggregate);
5752 } else {
5753 break;
5754 }
5755 }
5756
5757 /* skip entry when not present in every source dict */
5758 if (j != zsetnum) {
5759 zfree(score);
5760 } else {
5761 robj *o = dictGetEntryKey(de);
5762 dictAdd(dstzset->dict,o,score);
5763 incrRefCount(o); /* added to dictionary */
5764 zslInsert(dstzset->zsl,*score,o);
5765 incrRefCount(o); /* added to skiplist */
5766 }
5767 }
5768 dictReleaseIterator(di);
5769 }
5770 } else if (op == REDIS_OP_UNION) {
5771 for (i = 0; i < zsetnum; i++) {
5772 if (!src[i].dict) continue;
5773
5774 di = dictGetIterator(src[i].dict);
5775 while((de = dictNext(di)) != NULL) {
5776 /* skip key when already processed */
5777 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5778
5779 double *score = zmalloc(sizeof(double)), value;
5780 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5781
5782 /* because the zsets are sorted by size, its only possible
5783 * for sets at larger indices to hold this entry */
5784 for (j = (i+1); j < zsetnum; j++) {
5785 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5786 if (other) {
5787 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5788 zunionInterAggregate(score, value, aggregate);
5789 }
5790 }
5791
5792 robj *o = dictGetEntryKey(de);
5793 dictAdd(dstzset->dict,o,score);
5794 incrRefCount(o); /* added to dictionary */
5795 zslInsert(dstzset->zsl,*score,o);
5796 incrRefCount(o); /* added to skiplist */
5797 }
5798 dictReleaseIterator(di);
5799 }
5800 } else {
5801 /* unknown operator */
5802 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5803 }
5804
5805 deleteKey(c->db,dstkey);
5806 if (dstzset->zsl->length) {
5807 dictAdd(c->db->dict,dstkey,dstobj);
5808 incrRefCount(dstkey);
5809 addReplyLong(c, dstzset->zsl->length);
5810 server.dirty++;
5811 } else {
5812 decrRefCount(dstobj);
5813 addReply(c, shared.czero);
5814 }
5815 zfree(src);
5816 }
5817
5818 static void zunionCommand(redisClient *c) {
5819 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5820 }
5821
5822 static void zinterCommand(redisClient *c) {
5823 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5824 }
5825
5826 static void zrangeGenericCommand(redisClient *c, int reverse) {
5827 robj *o;
5828 long start;
5829 long end;
5830 int withscores = 0;
5831 int llen;
5832 int rangelen, j;
5833 zset *zsetobj;
5834 zskiplist *zsl;
5835 zskiplistNode *ln;
5836 robj *ele;
5837
5838 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5839 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
5840
5841 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5842 withscores = 1;
5843 } else if (c->argc >= 5) {
5844 addReply(c,shared.syntaxerr);
5845 return;
5846 }
5847
5848 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5849 || checkType(c,o,REDIS_ZSET)) return;
5850 zsetobj = o->ptr;
5851 zsl = zsetobj->zsl;
5852 llen = zsl->length;
5853
5854 /* convert negative indexes */
5855 if (start < 0) start = llen+start;
5856 if (end < 0) end = llen+end;
5857 if (start < 0) start = 0;
5858 if (end < 0) end = 0;
5859
5860 /* indexes sanity checks */
5861 if (start > end || start >= llen) {
5862 /* Out of range start or start > end result in empty list */
5863 addReply(c,shared.emptymultibulk);
5864 return;
5865 }
5866 if (end >= llen) end = llen-1;
5867 rangelen = (end-start)+1;
5868
5869 /* check if starting point is trivial, before searching
5870 * the element in log(N) time */
5871 if (reverse) {
5872 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5873 } else {
5874 ln = start == 0 ?
5875 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5876 }
5877
5878 /* Return the result in form of a multi-bulk reply */
5879 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5880 withscores ? (rangelen*2) : rangelen));
5881 for (j = 0; j < rangelen; j++) {
5882 ele = ln->obj;
5883 addReplyBulk(c,ele);
5884 if (withscores)
5885 addReplyDouble(c,ln->score);
5886 ln = reverse ? ln->backward : ln->forward[0];
5887 }
5888 }
5889
5890 static void zrangeCommand(redisClient *c) {
5891 zrangeGenericCommand(c,0);
5892 }
5893
5894 static void zrevrangeCommand(redisClient *c) {
5895 zrangeGenericCommand(c,1);
5896 }
5897
5898 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5899 * If justcount is non-zero, just the count is returned. */
5900 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5901 robj *o;
5902 double min, max;
5903 int minex = 0, maxex = 0; /* are min or max exclusive? */
5904 int offset = 0, limit = -1;
5905 int withscores = 0;
5906 int badsyntax = 0;
5907
5908 /* Parse the min-max interval. If one of the values is prefixed
5909 * by the "(" character, it's considered "open". For instance
5910 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5911 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5912 if (((char*)c->argv[2]->ptr)[0] == '(') {
5913 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5914 minex = 1;
5915 } else {
5916 min = strtod(c->argv[2]->ptr,NULL);
5917 }
5918 if (((char*)c->argv[3]->ptr)[0] == '(') {
5919 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5920 maxex = 1;
5921 } else {
5922 max = strtod(c->argv[3]->ptr,NULL);
5923 }
5924
5925 /* Parse "WITHSCORES": note that if the command was called with
5926 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5927 * enter the following paths to parse WITHSCORES and LIMIT. */
5928 if (c->argc == 5 || c->argc == 8) {
5929 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5930 withscores = 1;
5931 else
5932 badsyntax = 1;
5933 }
5934 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5935 badsyntax = 1;
5936 if (badsyntax) {
5937 addReplySds(c,
5938 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5939 return;
5940 }
5941
5942 /* Parse "LIMIT" */
5943 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5944 addReply(c,shared.syntaxerr);
5945 return;
5946 } else if (c->argc == (7 + withscores)) {
5947 offset = atoi(c->argv[5]->ptr);
5948 limit = atoi(c->argv[6]->ptr);
5949 if (offset < 0) offset = 0;
5950 }
5951
5952 /* Ok, lookup the key and get the range */
5953 o = lookupKeyRead(c->db,c->argv[1]);
5954 if (o == NULL) {
5955 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5956 } else {
5957 if (o->type != REDIS_ZSET) {
5958 addReply(c,shared.wrongtypeerr);
5959 } else {
5960 zset *zsetobj = o->ptr;
5961 zskiplist *zsl = zsetobj->zsl;
5962 zskiplistNode *ln;
5963 robj *ele, *lenobj = NULL;
5964 unsigned long rangelen = 0;
5965
5966 /* Get the first node with the score >= min, or with
5967 * score > min if 'minex' is true. */
5968 ln = zslFirstWithScore(zsl,min);
5969 while (minex && ln && ln->score == min) ln = ln->forward[0];
5970
5971 if (ln == NULL) {
5972 /* No element matching the speciifed interval */
5973 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5974 return;
5975 }
5976
5977 /* We don't know in advance how many matching elements there
5978 * are in the list, so we push this object that will represent
5979 * the multi-bulk length in the output buffer, and will "fix"
5980 * it later */
5981 if (!justcount) {
5982 lenobj = createObject(REDIS_STRING,NULL);
5983 addReply(c,lenobj);
5984 decrRefCount(lenobj);
5985 }
5986
5987 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5988 if (offset) {
5989 offset--;
5990 ln = ln->forward[0];
5991 continue;
5992 }
5993 if (limit == 0) break;
5994 if (!justcount) {
5995 ele = ln->obj;
5996 addReplyBulk(c,ele);
5997 if (withscores)
5998 addReplyDouble(c,ln->score);
5999 }
6000 ln = ln->forward[0];
6001 rangelen++;
6002 if (limit > 0) limit--;
6003 }
6004 if (justcount) {
6005 addReplyLong(c,(long)rangelen);
6006 } else {
6007 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6008 withscores ? (rangelen*2) : rangelen);
6009 }
6010 }
6011 }
6012 }
6013
6014 static void zrangebyscoreCommand(redisClient *c) {
6015 genericZrangebyscoreCommand(c,0);
6016 }
6017
6018 static void zcountCommand(redisClient *c) {
6019 genericZrangebyscoreCommand(c,1);
6020 }
6021
6022 static void zcardCommand(redisClient *c) {
6023 robj *o;
6024 zset *zs;
6025
6026 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6027 checkType(c,o,REDIS_ZSET)) return;
6028
6029 zs = o->ptr;
6030 addReplyUlong(c,zs->zsl->length);
6031 }
6032
6033 static void zscoreCommand(redisClient *c) {
6034 robj *o;
6035 zset *zs;
6036 dictEntry *de;
6037
6038 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6039 checkType(c,o,REDIS_ZSET)) return;
6040
6041 zs = o->ptr;
6042 de = dictFind(zs->dict,c->argv[2]);
6043 if (!de) {
6044 addReply(c,shared.nullbulk);
6045 } else {
6046 double *score = dictGetEntryVal(de);
6047
6048 addReplyDouble(c,*score);
6049 }
6050 }
6051
6052 static void zrankGenericCommand(redisClient *c, int reverse) {
6053 robj *o;
6054 zset *zs;
6055 zskiplist *zsl;
6056 dictEntry *de;
6057 unsigned long rank;
6058 double *score;
6059
6060 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6061 checkType(c,o,REDIS_ZSET)) return;
6062
6063 zs = o->ptr;
6064 zsl = zs->zsl;
6065 de = dictFind(zs->dict,c->argv[2]);
6066 if (!de) {
6067 addReply(c,shared.nullbulk);
6068 return;
6069 }
6070
6071 score = dictGetEntryVal(de);
6072 rank = zslGetRank(zsl, *score, c->argv[2]);
6073 if (rank) {
6074 if (reverse) {
6075 addReplyLong(c, zsl->length - rank);
6076 } else {
6077 addReplyLong(c, rank-1);
6078 }
6079 } else {
6080 addReply(c,shared.nullbulk);
6081 }
6082 }
6083
6084 static void zrankCommand(redisClient *c) {
6085 zrankGenericCommand(c, 0);
6086 }
6087
6088 static void zrevrankCommand(redisClient *c) {
6089 zrankGenericCommand(c, 1);
6090 }
6091
/* ========================= Hashes utility functions ======================= */

/* Selectors for the two halves of a hash entry. They are tested with '&'
 * and may be OR-ed together to request both. */
#define REDIS_HASH_KEY 1    /* the field name */
#define REDIS_HASH_VALUE 2  /* the value stored under the field */
6095
6096 /* Check the length of a number of objects to see if we need to convert a
6097 * zipmap to a real hash. Note that we only check string encoded objects
6098 * as their string length can be queried in constant time. */
6099 static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6100 int i;
6101 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
6102
6103 for (i = start; i <= end; i++) {
6104 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6105 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6106 {
6107 convertToRealHash(subject);
6108 return;
6109 }
6110 }
6111 }
6112
6113 /* Encode given objects in-place when the hash uses a dict. */
6114 static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6115 if (subject->encoding == REDIS_ENCODING_HT) {
6116 if (o1) *o1 = tryObjectEncoding(*o1);
6117 if (o2) *o2 = tryObjectEncoding(*o2);
6118 }
6119 }
6120
6121 /* Get the value from a hash identified by key. Returns either a string
6122 * object or NULL if the value cannot be found. The refcount of the object
6123 * is always increased by 1 when the value was found. */
6124 static robj *hashGet(robj *o, robj *key) {
6125 robj *value = NULL;
6126 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6127 unsigned char *v;
6128 unsigned int vlen;
6129 key = getDecodedObject(key);
6130 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6131 value = createStringObject((char*)v,vlen);
6132 }
6133 decrRefCount(key);
6134 } else {
6135 dictEntry *de = dictFind(o->ptr,key);
6136 if (de != NULL) {
6137 value = dictGetEntryVal(de);
6138 incrRefCount(value);
6139 }
6140 }
6141 return value;
6142 }
6143
6144 /* Test if the key exists in the given hash. Returns 1 if the key
6145 * exists and 0 when it doesn't. */
6146 static int hashExists(robj *o, robj *key) {
6147 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6148 key = getDecodedObject(key);
6149 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6150 decrRefCount(key);
6151 return 1;
6152 }
6153 decrRefCount(key);
6154 } else {
6155 if (dictFind(o->ptr,key) != NULL) {
6156 return 1;
6157 }
6158 }
6159 return 0;
6160 }
6161
6162 /* Add an element, discard the old if the key already exists.
6163 * Return 0 on insert and 1 on update. */
6164 static int hashSet(robj *o, robj *key, robj *value) {
6165 int update = 0;
6166 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6167 key = getDecodedObject(key);
6168 value = getDecodedObject(value);
6169 o->ptr = zipmapSet(o->ptr,
6170 key->ptr,sdslen(key->ptr),
6171 value->ptr,sdslen(value->ptr), &update);
6172 decrRefCount(key);
6173 decrRefCount(value);
6174
6175 /* Check if the zipmap needs to be upgraded to a real hash table */
6176 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
6177 convertToRealHash(o);
6178 } else {
6179 if (dictReplace(o->ptr,key,value)) {
6180 /* Insert */
6181 incrRefCount(key);
6182 } else {
6183 /* Update */
6184 update = 1;
6185 }
6186 incrRefCount(value);
6187 }
6188 return update;
6189 }
6190
6191 /* Delete an element from a hash.
6192 * Return 1 on deleted and 0 on not found. */
6193 static int hashDelete(robj *o, robj *key) {
6194 int deleted = 0;
6195 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6196 key = getDecodedObject(key);
6197 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6198 decrRefCount(key);
6199 } else {
6200 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6201 /* Always check if the dictionary needs a resize after a delete. */
6202 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
6203 }
6204 return deleted;
6205 }
6206
6207 /* Return the number of elements in a hash. */
6208 static unsigned long hashLength(robj *o) {
6209 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6210 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6211 }
6212
6213 /* Structure to hold hash iteration abstration. Note that iteration over
6214 * hashes involves both fields and values. Because it is possible that
6215 * not both are required, store pointers in the iterator to avoid
6216 * unnecessary memory allocation for fields/values. */
6217 typedef struct {
6218 int encoding;
6219 unsigned char *zi;
6220 unsigned char *zk, *zv;
6221 unsigned int zklen, zvlen;
6222
6223 dictIterator *di;
6224 dictEntry *de;
6225 } hashIterator;
6226
6227 static hashIterator *hashInitIterator(robj *subject) {
6228 hashIterator *hi = zmalloc(sizeof(hashIterator));
6229 hi->encoding = subject->encoding;
6230 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6231 hi->zi = zipmapRewind(subject->ptr);
6232 } else if (hi->encoding == REDIS_ENCODING_HT) {
6233 hi->di = dictGetIterator(subject->ptr);
6234 } else {
6235 redisAssert(NULL);
6236 }
6237 return hi;
6238 }
6239
6240 static void hashReleaseIterator(hashIterator *hi) {
6241 if (hi->encoding == REDIS_ENCODING_HT) {
6242 dictReleaseIterator(hi->di);
6243 }
6244 zfree(hi);
6245 }
6246
6247 /* Move to the next entry in the hash. Return REDIS_OK when the next entry
6248 * could be found and REDIS_ERR when the iterator reaches the end. */
6249 static int hashNext(hashIterator *hi) {
6250 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6251 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6252 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6253 } else {
6254 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6255 }
6256 return REDIS_OK;
6257 }
6258
6259 /* Get key or value object at current iteration position.
6260 * This increases the refcount of the field object by 1. */
6261 static robj *hashCurrent(hashIterator *hi, int what) {
6262 robj *o;
6263 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6264 if (what & REDIS_HASH_KEY) {
6265 o = createStringObject((char*)hi->zk,hi->zklen);
6266 } else {
6267 o = createStringObject((char*)hi->zv,hi->zvlen);
6268 }
6269 } else {
6270 if (what & REDIS_HASH_KEY) {
6271 o = dictGetEntryKey(hi->de);
6272 } else {
6273 o = dictGetEntryVal(hi->de);
6274 }
6275 incrRefCount(o);
6276 }
6277 return o;
6278 }
6279
6280 static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6281 robj *o = lookupKeyWrite(c->db,key);
6282 if (o == NULL) {
6283 o = createHashObject();
6284 dictAdd(c->db->dict,key,o);
6285 incrRefCount(key);
6286 } else {
6287 if (o->type != REDIS_HASH) {
6288 addReply(c,shared.wrongtypeerr);
6289 return NULL;
6290 }
6291 }
6292 return o;
6293 }
6294
6295 /* ============================= Hash commands ============================== */
6296 static void hsetCommand(redisClient *c) {
6297 int update;
6298 robj *o;
6299
6300 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6301 hashTryConversion(o,c->argv,2,3);
6302 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6303 update = hashSet(o,c->argv[2],c->argv[3]);
6304 addReply(c, update ? shared.czero : shared.cone);
6305 server.dirty++;
6306 }
6307
6308 static void hsetnxCommand(redisClient *c) {
6309 robj *o;
6310 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6311 hashTryConversion(o,c->argv,2,3);
6312
6313 if (hashExists(o, c->argv[2])) {
6314 addReply(c, shared.czero);
6315 } else {
6316 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
6317 hashSet(o,c->argv[2],c->argv[3]);
6318 addReply(c, shared.cone);
6319 server.dirty++;
6320 }
6321 }
6322
6323 static void hmsetCommand(redisClient *c) {
6324 int i;
6325 robj *o;
6326
6327 if ((c->argc % 2) == 1) {
6328 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6329 return;
6330 }
6331
6332 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6333 hashTryConversion(o,c->argv,2,c->argc-1);
6334 for (i = 2; i < c->argc; i += 2) {
6335 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
6336 hashSet(o,c->argv[i],c->argv[i+1]);
6337 }
6338 addReply(c, shared.ok);
6339 server.dirty++;
6340 }
6341
6342 static void hincrbyCommand(redisClient *c) {
6343 long long value, incr;
6344 robj *o, *current, *new;
6345
6346 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
6347 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6348 if ((current = hashGet(o,c->argv[2])) != NULL) {
6349 if (current->encoding == REDIS_ENCODING_RAW)
6350 value = strtoll(current->ptr,NULL,10);
6351 else if (current->encoding == REDIS_ENCODING_INT)
6352 value = (long)current->ptr;
6353 else
6354 redisAssert(1 != 1);
6355 decrRefCount(current);
6356 } else {
6357 value = 0;
6358 }
6359
6360 value += incr;
6361 new = createStringObjectFromLongLong(value);
6362 hashTryObjectEncoding(o,&c->argv[2],NULL);
6363 hashSet(o,c->argv[2],new);
6364 decrRefCount(new);
6365 addReplyLongLong(c,value);
6366 server.dirty++;
6367 }
6368
6369 static void hgetCommand(redisClient *c) {
6370 robj *o, *value;
6371 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6372 checkType(c,o,REDIS_HASH)) return;
6373
6374 if ((value = hashGet(o,c->argv[2])) != NULL) {
6375 addReplyBulk(c,value);
6376 decrRefCount(value);
6377 } else {
6378 addReply(c,shared.nullbulk);
6379 }
6380 }
6381
6382 static void hmgetCommand(redisClient *c) {
6383 int i;
6384 robj *o, *value;
6385 o = lookupKeyRead(c->db,c->argv[1]);
6386 if (o != NULL && o->type != REDIS_HASH) {
6387 addReply(c,shared.wrongtypeerr);
6388 }
6389
6390 /* Note the check for o != NULL happens inside the loop. This is
6391 * done because objects that cannot be found are considered to be
6392 * an empty hash. The reply should then be a series of NULLs. */
6393 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6394 for (i = 2; i < c->argc; i++) {
6395 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6396 addReplyBulk(c,value);
6397 decrRefCount(value);
6398 } else {
6399 addReply(c,shared.nullbulk);
6400 }
6401 }
6402 }
6403
6404 static void hdelCommand(redisClient *c) {
6405 robj *o;
6406 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6407 checkType(c,o,REDIS_HASH)) return;
6408
6409 if (hashDelete(o,c->argv[2])) {
6410 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6411 addReply(c,shared.cone);
6412 server.dirty++;
6413 } else {
6414 addReply(c,shared.czero);
6415 }
6416 }
6417
6418 static void hlenCommand(redisClient *c) {
6419 robj *o;
6420 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6421 checkType(c,o,REDIS_HASH)) return;
6422
6423 addReplyUlong(c,hashLength(o));
6424 }
6425
6426 static void genericHgetallCommand(redisClient *c, int flags) {
6427 robj *o, *lenobj, *obj;
6428 unsigned long count = 0;
6429 hashIterator *hi;
6430
6431 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6432 || checkType(c,o,REDIS_HASH)) return;
6433
6434 lenobj = createObject(REDIS_STRING,NULL);
6435 addReply(c,lenobj);
6436 decrRefCount(lenobj);
6437
6438 hi = hashInitIterator(o);
6439 while (hashNext(hi) != REDIS_ERR) {
6440 if (flags & REDIS_HASH_KEY) {
6441 obj = hashCurrent(hi,REDIS_HASH_KEY);
6442 addReplyBulk(c,obj);
6443 decrRefCount(obj);
6444 count++;
6445 }
6446 if (flags & REDIS_HASH_VALUE) {
6447 obj = hashCurrent(hi,REDIS_HASH_VALUE);
6448 addReplyBulk(c,obj);
6449 decrRefCount(obj);
6450 count++;
6451 }
6452 }
6453 hashReleaseIterator(hi);
6454
6455 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6456 }
6457
6458 static void hkeysCommand(redisClient *c) {
6459 genericHgetallCommand(c,REDIS_HASH_KEY);
6460 }
6461
6462 static void hvalsCommand(redisClient *c) {
6463 genericHgetallCommand(c,REDIS_HASH_VALUE);
6464 }
6465
6466 static void hgetallCommand(redisClient *c) {
6467 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
6468 }
6469
6470 static void hexistsCommand(redisClient *c) {
6471 robj *o;
6472 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6473 checkType(c,o,REDIS_HASH)) return;
6474
6475 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
6476 }
6477
6478 static void convertToRealHash(robj *o) {
6479 unsigned char *key, *val, *p, *zm = o->ptr;
6480 unsigned int klen, vlen;
6481 dict *dict = dictCreate(&hashDictType,NULL);
6482
6483 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6484 p = zipmapRewind(zm);
6485 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6486 robj *keyobj, *valobj;
6487
6488 keyobj = createStringObject((char*)key,klen);
6489 valobj = createStringObject((char*)val,vlen);
6490 keyobj = tryObjectEncoding(keyobj);
6491 valobj = tryObjectEncoding(valobj);
6492 dictAdd(dict,keyobj,valobj);
6493 }
6494 o->encoding = REDIS_ENCODING_HT;
6495 o->ptr = dict;
6496 zfree(zm);
6497 }
6498
6499 /* ========================= Non type-specific commands ==================== */
6500
6501 static void flushdbCommand(redisClient *c) {
6502 server.dirty += dictSize(c->db->dict);
6503 dictEmpty(c->db->dict);
6504 dictEmpty(c->db->expires);
6505 addReply(c,shared.ok);
6506 }
6507
6508 static void flushallCommand(redisClient *c) {
6509 server.dirty += emptyDb();
6510 addReply(c,shared.ok);
6511 if (server.bgsavechildpid != -1) {
6512 kill(server.bgsavechildpid,SIGKILL);
6513 rdbRemoveTempFile(server.bgsavechildpid);
6514 }
6515 rdbSave(server.dbfilename);
6516 server.dirty++;
6517 }
6518
6519 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6520 redisSortOperation *so = zmalloc(sizeof(*so));
6521 so->type = type;
6522 so->pattern = pattern;
6523 return so;
6524 }
6525
6526 /* Return the value associated to the key with a name obtained
6527 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6528 * The returned object will always have its refcount increased by 1
6529 * when it is non-NULL. */
6530 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6531 char *p, *f;
6532 sds spat, ssub;
6533 robj keyobj, fieldobj, *o;
6534 int prefixlen, sublen, postfixlen, fieldlen;
6535 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6536 struct {
6537 long len;
6538 long free;
6539 char buf[REDIS_SORTKEY_MAX+1];
6540 } keyname, fieldname;
6541
6542 /* If the pattern is "#" return the substitution object itself in order
6543 * to implement the "SORT ... GET #" feature. */
6544 spat = pattern->ptr;
6545 if (spat[0] == '#' && spat[1] == '\0') {
6546 incrRefCount(subst);
6547 return subst;
6548 }
6549
6550 /* The substitution object may be specially encoded. If so we create
6551 * a decoded object on the fly. Otherwise getDecodedObject will just
6552 * increment the ref count, that we'll decrement later. */
6553 subst = getDecodedObject(subst);
6554
6555 ssub = subst->ptr;
6556 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6557 p = strchr(spat,'*');
6558 if (!p) {
6559 decrRefCount(subst);
6560 return NULL;
6561 }
6562
6563 /* Find out if we're dealing with a hash dereference. */
6564 if ((f = strstr(p+1, "->")) != NULL) {
6565 fieldlen = sdslen(spat)-(f-spat);
6566 /* this also copies \0 character */
6567 memcpy(fieldname.buf,f+2,fieldlen-1);
6568 fieldname.len = fieldlen-2;
6569 } else {
6570 fieldlen = 0;
6571 }
6572
6573 prefixlen = p-spat;
6574 sublen = sdslen(ssub);
6575 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
6576 memcpy(keyname.buf,spat,prefixlen);
6577 memcpy(keyname.buf+prefixlen,ssub,sublen);
6578 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6579 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6580 keyname.len = prefixlen+sublen+postfixlen;
6581 decrRefCount(subst);
6582
6583 /* Lookup substituted key */
6584 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6585 o = lookupKeyRead(db,&keyobj);
6586 if (o == NULL) return NULL;
6587
6588 if (fieldlen > 0) {
6589 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6590
6591 /* Retrieve value from hash by the field name. This operation
6592 * already increases the refcount of the returned object. */
6593 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6594 o = hashGet(o, &fieldobj);
6595 } else {
6596 if (o->type != REDIS_STRING) return NULL;
6597
6598 /* Every object that this function returns needs to have its refcount
6599 * increased. sortCommand decreases it again. */
6600 incrRefCount(o);
6601 }
6602
6603 return o;
6604 }
6605
6606 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6607 * the additional parameter is not standard but a BSD-specific we have to
6608 * pass sorting parameters via the global 'server' structure */
6609 static int sortCompare(const void *s1, const void *s2) {
6610 const redisSortObject *so1 = s1, *so2 = s2;
6611 int cmp;
6612
6613 if (!server.sort_alpha) {
6614 /* Numeric sorting. Here it's trivial as we precomputed scores */
6615 if (so1->u.score > so2->u.score) {
6616 cmp = 1;
6617 } else if (so1->u.score < so2->u.score) {
6618 cmp = -1;
6619 } else {
6620 cmp = 0;
6621 }
6622 } else {
6623 /* Alphanumeric sorting */
6624 if (server.sort_bypattern) {
6625 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6626 /* At least one compare object is NULL */
6627 if (so1->u.cmpobj == so2->u.cmpobj)
6628 cmp = 0;
6629 else if (so1->u.cmpobj == NULL)
6630 cmp = -1;
6631 else
6632 cmp = 1;
6633 } else {
6634 /* We have both the objects, use strcoll */
6635 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6636 }
6637 } else {
6638 /* Compare elements directly. */
6639 cmp = compareStringObjects(so1->obj,so2->obj);
6640 }
6641 }
6642 return server.sort_desc ? -cmp : cmp;
6643 }
6644
6645 /* The SORT command is the most complex command in Redis. Warning: this code
6646 * is optimized for speed and a bit less for readability */
6647 static void sortCommand(redisClient *c) {
6648 list *operations;
6649 int outputlen = 0;
6650 int desc = 0, alpha = 0;
6651 int limit_start = 0, limit_count = -1, start, end;
6652 int j, dontsort = 0, vectorlen;
6653 int getop = 0; /* GET operation counter */
6654 robj *sortval, *sortby = NULL, *storekey = NULL;
6655 redisSortObject *vector; /* Resulting vector to sort */
6656
6657 /* Lookup the key to sort. It must be of the right types */
6658 sortval = lookupKeyRead(c->db,c->argv[1]);
6659 if (sortval == NULL) {
6660 addReply(c,shared.emptymultibulk);
6661 return;
6662 }
6663 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6664 sortval->type != REDIS_ZSET)
6665 {
6666 addReply(c,shared.wrongtypeerr);
6667 return;
6668 }
6669
6670 /* Create a list of operations to perform for every sorted element.
6671 * Operations can be GET/DEL/INCR/DECR */
6672 operations = listCreate();
6673 listSetFreeMethod(operations,zfree);
6674 j = 2;
6675
6676 /* Now we need to protect sortval incrementing its count, in the future
6677 * SORT may have options able to overwrite/delete keys during the sorting
6678 * and the sorted key itself may get destroied */
6679 incrRefCount(sortval);
6680
6681 /* The SORT command has an SQL-alike syntax, parse it */
6682 while(j < c->argc) {
6683 int leftargs = c->argc-j-1;
6684 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6685 desc = 0;
6686 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6687 desc = 1;
6688 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6689 alpha = 1;
6690 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6691 limit_start = atoi(c->argv[j+1]->ptr);
6692 limit_count = atoi(c->argv[j+2]->ptr);
6693 j+=2;
6694 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6695 storekey = c->argv[j+1];
6696 j++;
6697 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6698 sortby = c->argv[j+1];
6699 /* If the BY pattern does not contain '*', i.e. it is constant,
6700 * we don't need to sort nor to lookup the weight keys. */
6701 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6702 j++;
6703 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6704 listAddNodeTail(operations,createSortOperation(
6705 REDIS_SORT_GET,c->argv[j+1]));
6706 getop++;
6707 j++;
6708 } else {
6709 decrRefCount(sortval);
6710 listRelease(operations);
6711 addReply(c,shared.syntaxerr);
6712 return;
6713 }
6714 j++;
6715 }
6716
6717 /* Load the sorting vector with all the objects to sort */
6718 switch(sortval->type) {
6719 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6720 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6721 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6722 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
6723 }
6724 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6725 j = 0;
6726
6727 if (sortval->type == REDIS_LIST) {
6728 list *list = sortval->ptr;
6729 listNode *ln;
6730 listIter li;
6731
6732 listRewind(list,&li);
6733 while((ln = listNext(&li))) {
6734 robj *ele = ln->value;
6735 vector[j].obj = ele;
6736 vector[j].u.score = 0;
6737 vector[j].u.cmpobj = NULL;
6738 j++;
6739 }
6740 } else {
6741 dict *set;
6742 dictIterator *di;
6743 dictEntry *setele;
6744
6745 if (sortval->type == REDIS_SET) {
6746 set = sortval->ptr;
6747 } else {
6748 zset *zs = sortval->ptr;
6749 set = zs->dict;
6750 }
6751
6752 di = dictGetIterator(set);
6753 while((setele = dictNext(di)) != NULL) {
6754 vector[j].obj = dictGetEntryKey(setele);
6755 vector[j].u.score = 0;
6756 vector[j].u.cmpobj = NULL;
6757 j++;
6758 }
6759 dictReleaseIterator(di);
6760 }
6761 redisAssert(j == vectorlen);
6762
6763 /* Now it's time to load the right scores in the sorting vector */
6764 if (dontsort == 0) {
6765 for (j = 0; j < vectorlen; j++) {
6766 robj *byval;
6767 if (sortby) {
6768 /* lookup value to sort by */
6769 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6770 if (!byval) continue;
6771 } else {
6772 /* use object itself to sort by */
6773 byval = vector[j].obj;
6774 }
6775
6776 if (alpha) {
6777 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6778 } else {
6779 if (byval->encoding == REDIS_ENCODING_RAW) {
6780 vector[j].u.score = strtod(byval->ptr,NULL);
6781 } else if (byval->encoding == REDIS_ENCODING_INT) {
6782 /* Don't need to decode the object if it's
6783 * integer-encoded (the only encoding supported) so
6784 * far. We can just cast it */
6785 vector[j].u.score = (long)byval->ptr;
6786 } else {
6787 redisAssert(1 != 1);
6788 }
6789 }
6790
6791 /* when the object was retrieved using lookupKeyByPattern,
6792 * its refcount needs to be decreased. */
6793 if (sortby) {
6794 decrRefCount(byval);
6795 }
6796 }
6797 }
6798
6799 /* We are ready to sort the vector... perform a bit of sanity check
6800 * on the LIMIT option too. We'll use a partial version of quicksort. */
6801 start = (limit_start < 0) ? 0 : limit_start;
6802 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6803 if (start >= vectorlen) {
6804 start = vectorlen-1;
6805 end = vectorlen-2;
6806 }
6807 if (end >= vectorlen) end = vectorlen-1;
6808
6809 if (dontsort == 0) {
6810 server.sort_desc = desc;
6811 server.sort_alpha = alpha;
6812 server.sort_bypattern = sortby ? 1 : 0;
6813 if (sortby && (start != 0 || end != vectorlen-1))
6814 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6815 else
6816 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6817 }
6818
6819 /* Send command output to the output buffer, performing the specified
6820 * GET/DEL/INCR/DECR operations if any. */
6821 outputlen = getop ? getop*(end-start+1) : end-start+1;
6822 if (storekey == NULL) {
6823 /* STORE option not specified, sent the sorting result to client */
6824 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6825 for (j = start; j <= end; j++) {
6826 listNode *ln;
6827 listIter li;
6828
6829 if (!getop) addReplyBulk(c,vector[j].obj);
6830 listRewind(operations,&li);
6831 while((ln = listNext(&li))) {
6832 redisSortOperation *sop = ln->value;
6833 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6834 vector[j].obj);
6835
6836 if (sop->type == REDIS_SORT_GET) {
6837 if (!val) {
6838 addReply(c,shared.nullbulk);
6839 } else {
6840 addReplyBulk(c,val);
6841 decrRefCount(val);
6842 }
6843 } else {
6844 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6845 }
6846 }
6847 }
6848 } else {
6849 robj *listObject = createListObject();
6850 list *listPtr = (list*) listObject->ptr;
6851
6852 /* STORE option specified, set the sorting result as a List object */
6853 for (j = start; j <= end; j++) {
6854 listNode *ln;
6855 listIter li;
6856
6857 if (!getop) {
6858 listAddNodeTail(listPtr,vector[j].obj);
6859 incrRefCount(vector[j].obj);
6860 }
6861 listRewind(operations,&li);
6862 while((ln = listNext(&li))) {
6863 redisSortOperation *sop = ln->value;
6864 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6865 vector[j].obj);
6866
6867 if (sop->type == REDIS_SORT_GET) {
6868 if (!val) {
6869 listAddNodeTail(listPtr,createStringObject("",0));
6870 } else {
6871 /* We should do a incrRefCount on val because it is
6872 * added to the list, but also a decrRefCount because
6873 * it is returned by lookupKeyByPattern. This results
6874 * in doing nothing at all. */
6875 listAddNodeTail(listPtr,val);
6876 }
6877 } else {
6878 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6879 }
6880 }
6881 }
6882 if (dictReplace(c->db->dict,storekey,listObject)) {
6883 incrRefCount(storekey);
6884 }
6885 /* Note: we add 1 because the DB is dirty anyway since even if the
6886 * SORT result is empty a new key is set and maybe the old content
6887 * replaced. */
6888 server.dirty += 1+outputlen;
6889 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6890 }
6891
6892 /* Cleanup */
6893 decrRefCount(sortval);
6894 listRelease(operations);
6895 for (j = 0; j < vectorlen; j++) {
6896 if (alpha && vector[j].u.cmpobj)
6897 decrRefCount(vector[j].u.cmpobj);
6898 }
6899 zfree(vector);
6900 }
6901
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the destination buffer: it is always written and NUL terminated.
 * No size is passed, so the caller must provide room for the longest
 * possible output (a 20 digit number, the 'B' suffix and the terminator,
 * i.e. at least 22 bytes).
 *
 * Values below 1024 are printed as an exact byte count; larger values are
 * scaled by powers of 1024 and printed with two decimals using the K, M, G,
 * T and P suffixes. Anything too large for the P range falls back to the
 * exact byte count so that 's' is never left uninitialized. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Out of the scaled ranges: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6922
6923 /* Create the string returned by the INFO command. This is decoupled
6924 * by the INFO command itself as we need to report the same information
6925 * on memory corruption problems. */
6926 static sds genRedisInfoString(void) {
6927 sds info;
6928 time_t uptime = time(NULL)-server.stat_starttime;
6929 int j;
6930 char hmem[64];
6931
6932 bytesToHuman(hmem,zmalloc_used_memory());
6933 info = sdscatprintf(sdsempty(),
6934 "redis_version:%s\r\n"
6935 "arch_bits:%s\r\n"
6936 "multiplexing_api:%s\r\n"
6937 "process_id:%ld\r\n"
6938 "uptime_in_seconds:%ld\r\n"
6939 "uptime_in_days:%ld\r\n"
6940 "connected_clients:%d\r\n"
6941 "connected_slaves:%d\r\n"
6942 "blocked_clients:%d\r\n"
6943 "used_memory:%zu\r\n"
6944 "used_memory_human:%s\r\n"
6945 "changes_since_last_save:%lld\r\n"
6946 "bgsave_in_progress:%d\r\n"
6947 "last_save_time:%ld\r\n"
6948 "bgrewriteaof_in_progress:%d\r\n"
6949 "total_connections_received:%lld\r\n"
6950 "total_commands_processed:%lld\r\n"
6951 "expired_keys:%lld\r\n"
6952 "hash_max_zipmap_entries:%ld\r\n"
6953 "hash_max_zipmap_value:%ld\r\n"
6954 "pubsub_channels:%ld\r\n"
6955 "pubsub_patterns:%u\r\n"
6956 "vm_enabled:%d\r\n"
6957 "role:%s\r\n"
6958 ,REDIS_VERSION,
6959 (sizeof(long) == 8) ? "64" : "32",
6960 aeGetApiName(),
6961 (long) getpid(),
6962 uptime,
6963 uptime/(3600*24),
6964 listLength(server.clients)-listLength(server.slaves),
6965 listLength(server.slaves),
6966 server.blpop_blocked_clients,
6967 zmalloc_used_memory(),
6968 hmem,
6969 server.dirty,
6970 server.bgsavechildpid != -1,
6971 server.lastsave,
6972 server.bgrewritechildpid != -1,
6973 server.stat_numconnections,
6974 server.stat_numcommands,
6975 server.stat_expiredkeys,
6976 server.hash_max_zipmap_entries,
6977 server.hash_max_zipmap_value,
6978 dictSize(server.pubsub_channels),
6979 listLength(server.pubsub_patterns),
6980 server.vm_enabled != 0,
6981 server.masterhost == NULL ? "master" : "slave"
6982 );
6983 if (server.masterhost) {
6984 info = sdscatprintf(info,
6985 "master_host:%s\r\n"
6986 "master_port:%d\r\n"
6987 "master_link_status:%s\r\n"
6988 "master_last_io_seconds_ago:%d\r\n"
6989 ,server.masterhost,
6990 server.masterport,
6991 (server.replstate == REDIS_REPL_CONNECTED) ?
6992 "up" : "down",
6993 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6994 );
6995 }
6996 if (server.vm_enabled) {
6997 lockThreadedIO();
6998 info = sdscatprintf(info,
6999 "vm_conf_max_memory:%llu\r\n"
7000 "vm_conf_page_size:%llu\r\n"
7001 "vm_conf_pages:%llu\r\n"
7002 "vm_stats_used_pages:%llu\r\n"
7003 "vm_stats_swapped_objects:%llu\r\n"
7004 "vm_stats_swappin_count:%llu\r\n"
7005 "vm_stats_swappout_count:%llu\r\n"
7006 "vm_stats_io_newjobs_len:%lu\r\n"
7007 "vm_stats_io_processing_len:%lu\r\n"
7008 "vm_stats_io_processed_len:%lu\r\n"
7009 "vm_stats_io_active_threads:%lu\r\n"
7010 "vm_stats_blocked_clients:%lu\r\n"
7011 ,(unsigned long long) server.vm_max_memory,
7012 (unsigned long long) server.vm_page_size,
7013 (unsigned long long) server.vm_pages,
7014 (unsigned long long) server.vm_stats_used_pages,
7015 (unsigned long long) server.vm_stats_swapped_objects,
7016 (unsigned long long) server.vm_stats_swapins,
7017 (unsigned long long) server.vm_stats_swapouts,
7018 (unsigned long) listLength(server.io_newjobs),
7019 (unsigned long) listLength(server.io_processing),
7020 (unsigned long) listLength(server.io_processed),
7021 (unsigned long) server.io_active_threads,
7022 (unsigned long) server.vm_blocked_clients
7023 );
7024 unlockThreadedIO();
7025 }
7026 for (j = 0; j < server.dbnum; j++) {
7027 long long keys, vkeys;
7028
7029 keys = dictSize(server.db[j].dict);
7030 vkeys = dictSize(server.db[j].expires);
7031 if (keys || vkeys) {
7032 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
7033 j, keys, vkeys);
7034 }
7035 }
7036 return info;
7037 }
7038
7039 static void infoCommand(redisClient *c) {
7040 sds info = genRedisInfoString();
7041 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7042 (unsigned long)sdslen(info)));
7043 addReplySds(c,info);
7044 addReply(c,shared.crlf);
7045 }
7046
7047 static void monitorCommand(redisClient *c) {
7048 /* ignore MONITOR if aleady slave or in monitor mode */
7049 if (c->flags & REDIS_SLAVE) return;
7050
7051 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7052 c->slaveseldb = 0;
7053 listAddNodeTail(server.monitors,c);
7054 addReply(c,shared.ok);
7055 }
7056
7057 /* ================================= Expire ================================= */
7058 static int removeExpire(redisDb *db, robj *key) {
7059 if (dictDelete(db->expires,key) == DICT_OK) {
7060 return 1;
7061 } else {
7062 return 0;
7063 }
7064 }
7065
7066 static int setExpire(redisDb *db, robj *key, time_t when) {
7067 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7068 return 0;
7069 } else {
7070 incrRefCount(key);
7071 return 1;
7072 }
7073 }
7074
7075 /* Return the expire time of the specified key, or -1 if no expire
7076 * is associated with this key (i.e. the key is non volatile) */
7077 static time_t getExpire(redisDb *db, robj *key) {
7078 dictEntry *de;
7079
7080 /* No expire? return ASAP */
7081 if (dictSize(db->expires) == 0 ||
7082 (de = dictFind(db->expires,key)) == NULL) return -1;
7083
7084 return (time_t) dictGetEntryVal(de);
7085 }
7086
7087 static int expireIfNeeded(redisDb *db, robj *key) {
7088 time_t when;
7089 dictEntry *de;
7090
7091 /* No expire? return ASAP */
7092 if (dictSize(db->expires) == 0 ||
7093 (de = dictFind(db->expires,key)) == NULL) return 0;
7094
7095 /* Lookup the expire */
7096 when = (time_t) dictGetEntryVal(de);
7097 if (time(NULL) <= when) return 0;
7098
7099 /* Delete the key */
7100 dictDelete(db->expires,key);
7101 server.stat_expiredkeys++;
7102 return dictDelete(db->dict,key) == DICT_OK;
7103 }
7104
7105 static int deleteIfVolatile(redisDb *db, robj *key) {
7106 dictEntry *de;
7107
7108 /* No expire? return ASAP */
7109 if (dictSize(db->expires) == 0 ||
7110 (de = dictFind(db->expires,key)) == NULL) return 0;
7111
7112 /* Delete the key */
7113 server.dirty++;
7114 server.stat_expiredkeys++;
7115 dictDelete(db->expires,key);
7116 return dictDelete(db->dict,key) == DICT_OK;
7117 }
7118
7119 static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
7120 dictEntry *de;
7121 time_t seconds;
7122
7123 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
7124
7125 seconds -= offset;
7126
7127 de = dictFind(c->db->dict,key);
7128 if (de == NULL) {
7129 addReply(c,shared.czero);
7130 return;
7131 }
7132 if (seconds <= 0) {
7133 if (deleteKey(c->db,key)) server.dirty++;
7134 addReply(c, shared.cone);
7135 return;
7136 } else {
7137 time_t when = time(NULL)+seconds;
7138 if (setExpire(c->db,key,when)) {
7139 addReply(c,shared.cone);
7140 server.dirty++;
7141 } else {
7142 addReply(c,shared.czero);
7143 }
7144 return;
7145 }
7146 }
7147
7148 static void expireCommand(redisClient *c) {
7149 expireGenericCommand(c,c->argv[1],c->argv[2],0);
7150 }
7151
7152 static void expireatCommand(redisClient *c) {
7153 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
7154 }
7155
7156 static void ttlCommand(redisClient *c) {
7157 time_t expire;
7158 int ttl = -1;
7159
7160 expire = getExpire(c->db,c->argv[1]);
7161 if (expire != -1) {
7162 ttl = (int) (expire-time(NULL));
7163 if (ttl < 0) ttl = -1;
7164 }
7165 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7166 }
7167
7168 /* ================================ MULTI/EXEC ============================== */
7169
7170 /* Client state initialization for MULTI/EXEC */
7171 static void initClientMultiState(redisClient *c) {
7172 c->mstate.commands = NULL;
7173 c->mstate.count = 0;
7174 }
7175
7176 /* Release all the resources associated with MULTI/EXEC state */
7177 static void freeClientMultiState(redisClient *c) {
7178 int j;
7179
7180 for (j = 0; j < c->mstate.count; j++) {
7181 int i;
7182 multiCmd *mc = c->mstate.commands+j;
7183
7184 for (i = 0; i < mc->argc; i++)
7185 decrRefCount(mc->argv[i]);
7186 zfree(mc->argv);
7187 }
7188 zfree(c->mstate.commands);
7189 }
7190
7191 /* Add a new command into the MULTI commands queue */
7192 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7193 multiCmd *mc;
7194 int j;
7195
7196 c->mstate.commands = zrealloc(c->mstate.commands,
7197 sizeof(multiCmd)*(c->mstate.count+1));
7198 mc = c->mstate.commands+c->mstate.count;
7199 mc->cmd = cmd;
7200 mc->argc = c->argc;
7201 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7202 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7203 for (j = 0; j < c->argc; j++)
7204 incrRefCount(mc->argv[j]);
7205 c->mstate.count++;
7206 }
7207
7208 static void multiCommand(redisClient *c) {
7209 c->flags |= REDIS_MULTI;
7210 addReply(c,shared.ok);
7211 }
7212
7213 static void discardCommand(redisClient *c) {
7214 if (!(c->flags & REDIS_MULTI)) {
7215 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7216 return;
7217 }
7218
7219 freeClientMultiState(c);
7220 initClientMultiState(c);
7221 c->flags &= (~REDIS_MULTI);
7222 addReply(c,shared.ok);
7223 }
7224
7225 /* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7226 * implememntation for more information. */
7227 static void execCommandReplicateMulti(redisClient *c) {
7228 struct redisCommand *cmd;
7229 robj *multistring = createStringObject("MULTI",5);
7230
7231 cmd = lookupCommand("multi");
7232 if (server.appendonly)
7233 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7234 if (listLength(server.slaves))
7235 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7236 decrRefCount(multistring);
7237 }
7238
7239 static void execCommand(redisClient *c) {
7240 int j;
7241 robj **orig_argv;
7242 int orig_argc;
7243
7244 if (!(c->flags & REDIS_MULTI)) {
7245 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7246 return;
7247 }
7248
7249 /* Replicate a MULTI request now that we are sure the block is executed.
7250 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7251 * both the AOF and the replication link will have the same consistency
7252 * and atomicity guarantees. */
7253 execCommandReplicateMulti(c);
7254
7255 /* Exec all the queued commands */
7256 orig_argv = c->argv;
7257 orig_argc = c->argc;
7258 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7259 for (j = 0; j < c->mstate.count; j++) {
7260 c->argc = c->mstate.commands[j].argc;
7261 c->argv = c->mstate.commands[j].argv;
7262 call(c,c->mstate.commands[j].cmd);
7263 }
7264 c->argv = orig_argv;
7265 c->argc = orig_argc;
7266 freeClientMultiState(c);
7267 initClientMultiState(c);
7268 c->flags &= (~REDIS_MULTI);
7269 /* Make sure the EXEC command is always replicated / AOF, since we
7270 * always send the MULTI command (we can't know beforehand if the
7271 * next operations will contain at least a modification to the DB). */
7272 server.dirty++;
7273 }
7274
7275 /* =========================== Blocking Operations ========================= */
7276
7277 /* Currently Redis blocking operations support is limited to list POP ops,
7278 * so the current implementation is not fully generic, but it is also not
7279 * completely specific so it will not require a rewrite to support new
7280 * kind of blocking operations in the future.
7281 *
7282 * Still it's important to note that list blocking operations can be already
7283 * used as a notification mechanism in order to implement other blocking
7284 * operations at application level, so there must be a very strong evidence
7285 * of usefulness and generality before new blocking operations are implemented.
7286 *
7287 * This is how the current blocking POP works, using BLPOP as an example:
7288 * - If the user calls BLPOP and the key exists and contains a non-empty list
7289 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7290 * if there is no need to block.
7291 * - If instead BLPOP is called and the key does not exist or the list is
7292 * empty we need to block. In order to do so we remove the notification for
7293 * new data to read in the client socket (so that we'll not serve new
7294 * requests if the blocking request is not served). Also we put the client
7295 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
7296 * blocked on these keys.
7297 * - If a PUSH operation against a key with blocked clients waiting is
7298 * performed, we serve the first in the list: basically instead of pushing
7299 * the new element inside the list we return it to the (first / oldest)
7300 * blocked client, unblock the client, and remove it from the list.
7301 *
7302 * The above comment and the source code should be enough in order to understand
7303 * the implementation and modify / fix it later.
7304 */
7305
7306 /* Set a client in blocking mode for the specified key, with the specified
7307 * timeout */
7308 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
7309 dictEntry *de;
7310 list *l;
7311 int j;
7312
7313 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7314 c->blockingkeysnum = numkeys;
7315 c->blockingto = timeout;
7316 for (j = 0; j < numkeys; j++) {
7317 /* Add the key in the client structure, to map clients -> keys */
7318 c->blockingkeys[j] = keys[j];
7319 incrRefCount(keys[j]);
7320
7321 /* And in the other "side", to map keys -> clients */
7322 de = dictFind(c->db->blockingkeys,keys[j]);
7323 if (de == NULL) {
7324 int retval;
7325
7326 /* For every key we take a list of clients blocked for it */
7327 l = listCreate();
7328 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7329 incrRefCount(keys[j]);
7330 assert(retval == DICT_OK);
7331 } else {
7332 l = dictGetEntryVal(de);
7333 }
7334 listAddNodeTail(l,c);
7335 }
7336 /* Mark the client as a blocked client */
7337 c->flags |= REDIS_BLOCKED;
7338 server.blpop_blocked_clients++;
7339 }
7340
7341 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7342 static void unblockClientWaitingData(redisClient *c) {
7343 dictEntry *de;
7344 list *l;
7345 int j;
7346
7347 assert(c->blockingkeys != NULL);
7348 /* The client may wait for multiple keys, so unblock it for every key. */
7349 for (j = 0; j < c->blockingkeysnum; j++) {
7350 /* Remove this client from the list of clients waiting for this key. */
7351 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7352 assert(de != NULL);
7353 l = dictGetEntryVal(de);
7354 listDelNode(l,listSearchKey(l,c));
7355 /* If the list is empty we need to remove it to avoid wasting memory */
7356 if (listLength(l) == 0)
7357 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7358 decrRefCount(c->blockingkeys[j]);
7359 }
7360 /* Cleanup the client structure */
7361 zfree(c->blockingkeys);
7362 c->blockingkeys = NULL;
7363 c->flags &= (~REDIS_BLOCKED);
7364 server.blpop_blocked_clients--;
7365 /* We want to process data if there is some command waiting
7366 * in the input buffer. Note that this is safe even if
7367 * unblockClientWaitingData() gets called from freeClient() because
7368 * freeClient() will be smart enough to call this function
7369 * *after* c->querybuf was set to NULL. */
7370 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7371 }
7372
7373 /* This should be called from any function PUSHing into lists.
7374 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7375 * 'ele' is the element pushed.
7376 *
7377 * If the function returns 0 there was no client waiting for a list push
7378 * against this key.
7379 *
7380 * If the function returns 1 there was a client waiting for a list push
7381 * against this key, the element was passed to this client thus it's not
7382 * needed to actually add it to the list and the caller should return asap. */
7383 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7384 struct dictEntry *de;
7385 redisClient *receiver;
7386 list *l;
7387 listNode *ln;
7388
7389 de = dictFind(c->db->blockingkeys,key);
7390 if (de == NULL) return 0;
7391 l = dictGetEntryVal(de);
7392 ln = listFirst(l);
7393 assert(ln != NULL);
7394 receiver = ln->value;
7395
7396 addReplySds(receiver,sdsnew("*2\r\n"));
7397 addReplyBulk(receiver,key);
7398 addReplyBulk(receiver,ele);
7399 unblockClientWaitingData(receiver);
7400 return 1;
7401 }
7402
7403 /* Blocking RPOP/LPOP */
7404 static void blockingPopGenericCommand(redisClient *c, int where) {
7405 robj *o;
7406 time_t timeout;
7407 int j;
7408
7409 for (j = 1; j < c->argc-1; j++) {
7410 o = lookupKeyWrite(c->db,c->argv[j]);
7411 if (o != NULL) {
7412 if (o->type != REDIS_LIST) {
7413 addReply(c,shared.wrongtypeerr);
7414 return;
7415 } else {
7416 list *list = o->ptr;
7417 if (listLength(list) != 0) {
7418 /* If the list contains elements fall back to the usual
7419 * non-blocking POP operation */
7420 robj *argv[2], **orig_argv;
7421 int orig_argc;
7422
7423 /* We need to alter the command arguments before to call
7424 * popGenericCommand() as the command takes a single key. */
7425 orig_argv = c->argv;
7426 orig_argc = c->argc;
7427 argv[1] = c->argv[j];
7428 c->argv = argv;
7429 c->argc = 2;
7430
7431 /* Also the return value is different, we need to output
7432 * the multi bulk reply header and the key name. The
7433 * "real" command will add the last element (the value)
7434 * for us. If this souds like an hack to you it's just
7435 * because it is... */
7436 addReplySds(c,sdsnew("*2\r\n"));
7437 addReplyBulk(c,argv[1]);
7438 popGenericCommand(c,where);
7439
7440 /* Fix the client structure with the original stuff */
7441 c->argv = orig_argv;
7442 c->argc = orig_argc;
7443 return;
7444 }
7445 }
7446 }
7447 }
7448 /* If the list is empty or the key does not exists we must block */
7449 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7450 if (timeout > 0) timeout += time(NULL);
7451 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7452 }
7453
7454 static void blpopCommand(redisClient *c) {
7455 blockingPopGenericCommand(c,REDIS_HEAD);
7456 }
7457
7458 static void brpopCommand(redisClient *c) {
7459 blockingPopGenericCommand(c,REDIS_TAIL);
7460 }
7461
7462 /* =============================== Replication ============================= */
7463
7464 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7465 ssize_t nwritten, ret = size;
7466 time_t start = time(NULL);
7467
7468 timeout++;
7469 while(size) {
7470 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7471 nwritten = write(fd,ptr,size);
7472 if (nwritten == -1) return -1;
7473 ptr += nwritten;
7474 size -= nwritten;
7475 }
7476 if ((time(NULL)-start) > timeout) {
7477 errno = ETIMEDOUT;
7478 return -1;
7479 }
7480 }
7481 return ret;
7482 }
7483
7484 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7485 ssize_t nread, totread = 0;
7486 time_t start = time(NULL);
7487
7488 timeout++;
7489 while(size) {
7490 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7491 nread = read(fd,ptr,size);
7492 if (nread == -1) return -1;
7493 ptr += nread;
7494 size -= nread;
7495 totread += nread;
7496 }
7497 if ((time(NULL)-start) > timeout) {
7498 errno = ETIMEDOUT;
7499 return -1;
7500 }
7501 }
7502 return totread;
7503 }
7504
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. 'timeout' is applied to every single byte read via syncRead().
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the terminator. At most size-1 characters are stored so
 * the result is always NUL terminated and never overflows the buffer: a
 * longer line is truncated and the rest is left unread.
 *
 * Returns the number of characters stored before the newline (a stripped
 * trailing '\r' is still counted), or -1 if syncRead() failed. If 'size' is
 * not positive nothing is read or written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';

    size--; /* reserve room for the terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        size--; /* one less free slot: this is what bounds the loop */
    }
    return nread;
}
7525
7526 static void syncCommand(redisClient *c) {
7527 /* ignore SYNC if aleady slave or in monitor mode */
7528 if (c->flags & REDIS_SLAVE) return;
7529
7530 /* SYNC can't be issued when the server has pending data to send to
7531 * the client about already issued commands. We need a fresh reply
7532 * buffer registering the differences between the BGSAVE and the current
7533 * dataset, so that we can copy to other slaves if needed. */
7534 if (listLength(c->reply) != 0) {
7535 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7536 return;
7537 }
7538
7539 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7540 /* Here we need to check if there is a background saving operation
7541 * in progress, or if it is required to start one */
7542 if (server.bgsavechildpid != -1) {
7543 /* Ok a background save is in progress. Let's check if it is a good
7544 * one for replication, i.e. if there is another slave that is
7545 * registering differences since the server forked to save */
7546 redisClient *slave;
7547 listNode *ln;
7548 listIter li;
7549
7550 listRewind(server.slaves,&li);
7551 while((ln = listNext(&li))) {
7552 slave = ln->value;
7553 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7554 }
7555 if (ln) {
7556 /* Perfect, the server is already registering differences for
7557 * another slave. Set the right state, and copy the buffer. */
7558 listRelease(c->reply);
7559 c->reply = listDup(slave->reply);
7560 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7561 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7562 } else {
7563 /* No way, we need to wait for the next BGSAVE in order to
7564 * register differences */
7565 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7566 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7567 }
7568 } else {
7569 /* Ok we don't have a BGSAVE in progress, let's start one */
7570 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7571 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7572 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7573 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7574 return;
7575 }
7576 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7577 }
7578 c->repldbfd = -1;
7579 c->flags |= REDIS_SLAVE;
7580 c->slaveseldb = 0;
7581 listAddNodeTail(server.slaves,c);
7582 return;
7583 }
7584
7585 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7586 redisClient *slave = privdata;
7587 REDIS_NOTUSED(el);
7588 REDIS_NOTUSED(mask);
7589 char buf[REDIS_IOBUF_LEN];
7590 ssize_t nwritten, buflen;
7591
7592 if (slave->repldboff == 0) {
7593 /* Write the bulk write count before to transfer the DB. In theory here
7594 * we don't know how much room there is in the output buffer of the
7595 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7596 * operations) will never be smaller than the few bytes we need. */
7597 sds bulkcount;
7598
7599 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7600 slave->repldbsize);
7601 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7602 {
7603 sdsfree(bulkcount);
7604 freeClient(slave);
7605 return;
7606 }
7607 sdsfree(bulkcount);
7608 }
7609 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7610 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7611 if (buflen <= 0) {
7612 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7613 (buflen == 0) ? "premature EOF" : strerror(errno));
7614 freeClient(slave);
7615 return;
7616 }
7617 if ((nwritten = write(fd,buf,buflen)) == -1) {
7618 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7619 strerror(errno));
7620 freeClient(slave);
7621 return;
7622 }
7623 slave->repldboff += nwritten;
7624 if (slave->repldboff == slave->repldbsize) {
7625 close(slave->repldbfd);
7626 slave->repldbfd = -1;
7627 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7628 slave->replstate = REDIS_REPL_ONLINE;
7629 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7630 sendReplyToClient, slave) == AE_ERR) {
7631 freeClient(slave);
7632 return;
7633 }
7634 addReplySds(slave,sdsempty());
7635 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7636 }
7637 }
7638
7639 /* This function is called at the end of every backgrond saving.
7640 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7641 * otherwise REDIS_ERR is passed to the function.
7642 *
7643 * The goal of this function is to handle slaves waiting for a successful
7644 * background saving in order to perform non-blocking synchronization. */
7645 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7646 listNode *ln;
7647 int startbgsave = 0;
7648 listIter li;
7649
7650 listRewind(server.slaves,&li);
7651 while((ln = listNext(&li))) {
7652 redisClient *slave = ln->value;
7653
7654 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7655 startbgsave = 1;
7656 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7657 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7658 struct redis_stat buf;
7659
7660 if (bgsaveerr != REDIS_OK) {
7661 freeClient(slave);
7662 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7663 continue;
7664 }
7665 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7666 redis_fstat(slave->repldbfd,&buf) == -1) {
7667 freeClient(slave);
7668 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7669 continue;
7670 }
7671 slave->repldboff = 0;
7672 slave->repldbsize = buf.st_size;
7673 slave->replstate = REDIS_REPL_SEND_BULK;
7674 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7675 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7676 freeClient(slave);
7677 continue;
7678 }
7679 }
7680 }
7681 if (startbgsave) {
7682 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7683 listIter li;
7684
7685 listRewind(server.slaves,&li);
7686 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7687 while((ln = listNext(&li))) {
7688 redisClient *slave = ln->value;
7689
7690 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7691 freeClient(slave);
7692 }
7693 }
7694 }
7695 }
7696
7697 static int syncWithMaster(void) {
7698 char buf[1024], tmpfile[256], authcmd[1024];
7699 long dumpsize;
7700 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7701 int dfd, maxtries = 5;
7702
7703 if (fd == -1) {
7704 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7705 strerror(errno));
7706 return REDIS_ERR;
7707 }
7708
7709 /* AUTH with the master if required. */
7710 if(server.masterauth) {
7711 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7712 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7713 close(fd);
7714 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7715 strerror(errno));
7716 return REDIS_ERR;
7717 }
7718 /* Read the AUTH result. */
7719 if (syncReadLine(fd,buf,1024,3600) == -1) {
7720 close(fd);
7721 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7722 strerror(errno));
7723 return REDIS_ERR;
7724 }
7725 if (buf[0] != '+') {
7726 close(fd);
7727 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7728 return REDIS_ERR;
7729 }
7730 }
7731
7732 /* Issue the SYNC command */
7733 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7734 close(fd);
7735 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7736 strerror(errno));
7737 return REDIS_ERR;
7738 }
7739 /* Read the bulk write count */
7740 if (syncReadLine(fd,buf,1024,3600) == -1) {
7741 close(fd);
7742 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7743 strerror(errno));
7744 return REDIS_ERR;
7745 }
7746 if (buf[0] != '$') {
7747 close(fd);
7748 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7749 return REDIS_ERR;
7750 }
7751 dumpsize = strtol(buf+1,NULL,10);
7752 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7753 /* Read the bulk write data on a temp file */
7754 while(maxtries--) {
7755 snprintf(tmpfile,256,
7756 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7757 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7758 if (dfd != -1) break;
7759 sleep(1);
7760 }
7761 if (dfd == -1) {
7762 close(fd);
7763 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7764 return REDIS_ERR;
7765 }
7766 while(dumpsize) {
7767 int nread, nwritten;
7768
7769 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7770 if (nread == -1) {
7771 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7772 strerror(errno));
7773 close(fd);
7774 close(dfd);
7775 return REDIS_ERR;
7776 }
7777 nwritten = write(dfd,buf,nread);
7778 if (nwritten == -1) {
7779 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7780 close(fd);
7781 close(dfd);
7782 return REDIS_ERR;
7783 }
7784 dumpsize -= nread;
7785 }
7786 close(dfd);
7787 if (rename(tmpfile,server.dbfilename) == -1) {
7788 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7789 unlink(tmpfile);
7790 close(fd);
7791 return REDIS_ERR;
7792 }
7793 emptyDb();
7794 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7795 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7796 close(fd);
7797 return REDIS_ERR;
7798 }
7799 server.master = createClient(fd);
7800 server.master->flags |= REDIS_MASTER;
7801 server.master->authenticated = 1;
7802 server.replstate = REDIS_REPL_CONNECTED;
7803 return REDIS_OK;
7804 }
7805
7806 static void slaveofCommand(redisClient *c) {
7807 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7808 !strcasecmp(c->argv[2]->ptr,"one")) {
7809 if (server.masterhost) {
7810 sdsfree(server.masterhost);
7811 server.masterhost = NULL;
7812 if (server.master) freeClient(server.master);
7813 server.replstate = REDIS_REPL_NONE;
7814 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7815 }
7816 } else {
7817 sdsfree(server.masterhost);
7818 server.masterhost = sdsdup(c->argv[1]->ptr);
7819 server.masterport = atoi(c->argv[2]->ptr);
7820 if (server.master) freeClient(server.master);
7821 server.replstate = REDIS_REPL_CONNECT;
7822 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7823 server.masterhost, server.masterport);
7824 }
7825 addReply(c,shared.ok);
7826 }
7827
7828 /* ============================ Maxmemory directive ======================== */
7829
7830 /* Try to free one object form the pre-allocated objects free list.
7831 * This is useful under low mem conditions as by default we take 1 million
7832 * free objects allocated. On success REDIS_OK is returned, otherwise
7833 * REDIS_ERR. */
7834 static int tryFreeOneObjectFromFreelist(void) {
7835 robj *o;
7836
7837 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7838 if (listLength(server.objfreelist)) {
7839 listNode *head = listFirst(server.objfreelist);
7840 o = listNodeValue(head);
7841 listDelNode(server.objfreelist,head);
7842 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7843 zfree(o);
7844 return REDIS_OK;
7845 } else {
7846 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7847 return REDIS_ERR;
7848 }
7849 }
7850
7851 /* This function gets called when 'maxmemory' is set on the config file to limit
7852 * the max memory used by the server, and we are out of memory.
7853 * This function will try to, in order:
7854 *
7855 * - Free objects from the free list
7856 * - Try to remove keys with an EXPIRE set
7857 *
7858 * It is not possible to free enough memory to reach used-memory < maxmemory
7859 * the server will start refusing commands that will enlarge even more the
7860 * memory usage.
7861 */
7862 static void freeMemoryIfNeeded(void) {
7863 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7864 int j, k, freed = 0;
7865
7866 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7867 for (j = 0; j < server.dbnum; j++) {
7868 int minttl = -1;
7869 robj *minkey = NULL;
7870 struct dictEntry *de;
7871
7872 if (dictSize(server.db[j].expires)) {
7873 freed = 1;
7874 /* From a sample of three keys drop the one nearest to
7875 * the natural expire */
7876 for (k = 0; k < 3; k++) {
7877 time_t t;
7878
7879 de = dictGetRandomKey(server.db[j].expires);
7880 t = (time_t) dictGetEntryVal(de);
7881 if (minttl == -1 || t < minttl) {
7882 minkey = dictGetEntryKey(de);
7883 minttl = t;
7884 }
7885 }
7886 deleteKey(server.db+j,minkey);
7887 }
7888 }
7889 if (!freed) return; /* nothing to free... */
7890 }
7891 }
7892
7893 /* ============================== Append Only file ========================== */
7894
7895 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7896 sds buf = sdsempty();
7897 int j;
7898 ssize_t nwritten;
7899 time_t now;
7900 robj *tmpargv[3];
7901
7902 /* The DB this command was targetting is not the same as the last command
7903 * we appendend. To issue a SELECT command is needed. */
7904 if (dictid != server.appendseldb) {
7905 char seldb[64];
7906
7907 snprintf(seldb,sizeof(seldb),"%d",dictid);
7908 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7909 (unsigned long)strlen(seldb),seldb);
7910 server.appendseldb = dictid;
7911 }
7912
7913 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7914 * EXPIREs into EXPIREATs calls */
7915 if (cmd->proc == expireCommand) {
7916 long when;
7917
7918 tmpargv[0] = createStringObject("EXPIREAT",8);
7919 tmpargv[1] = argv[1];
7920 incrRefCount(argv[1]);
7921 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7922 tmpargv[2] = createObject(REDIS_STRING,
7923 sdscatprintf(sdsempty(),"%ld",when));
7924 argv = tmpargv;
7925 }
7926
7927 /* Append the actual command */
7928 buf = sdscatprintf(buf,"*%d\r\n",argc);
7929 for (j = 0; j < argc; j++) {
7930 robj *o = argv[j];
7931
7932 o = getDecodedObject(o);
7933 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7934 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7935 buf = sdscatlen(buf,"\r\n",2);
7936 decrRefCount(o);
7937 }
7938
7939 /* Free the objects from the modified argv for EXPIREAT */
7940 if (cmd->proc == expireCommand) {
7941 for (j = 0; j < 3; j++)
7942 decrRefCount(argv[j]);
7943 }
7944
7945 /* We want to perform a single write. This should be guaranteed atomic
7946 * at least if the filesystem we are writing is a real physical one.
7947 * While this will save us against the server being killed I don't think
7948 * there is much to do about the whole server stopping for power problems
7949 * or alike */
7950 nwritten = write(server.appendfd,buf,sdslen(buf));
7951 if (nwritten != (signed)sdslen(buf)) {
7952 /* Ooops, we are in troubles. The best thing to do for now is
7953 * to simply exit instead to give the illusion that everything is
7954 * working as expected. */
7955 if (nwritten == -1) {
7956 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7957 } else {
7958 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7959 }
7960 exit(1);
7961 }
7962 /* If a background append only file rewriting is in progress we want to
7963 * accumulate the differences between the child DB and the current one
7964 * in a buffer, so that when the child process will do its work we
7965 * can append the differences to the new append only file. */
7966 if (server.bgrewritechildpid != -1)
7967 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7968
7969 sdsfree(buf);
7970 now = time(NULL);
7971 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7972 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7973 now-server.lastfsync > 1))
7974 {
7975 fsync(server.appendfd); /* Let's try to get this data on the disk */
7976 server.lastfsync = now;
7977 }
7978 }
7979
7980 /* In Redis commands are always executed in the context of a client, so in
7981 * order to load the append only file we need to create a fake client. */
7982 static struct redisClient *createFakeClient(void) {
7983 struct redisClient *c = zmalloc(sizeof(*c));
7984
7985 selectDb(c,0);
7986 c->fd = -1;
7987 c->querybuf = sdsempty();
7988 c->argc = 0;
7989 c->argv = NULL;
7990 c->flags = 0;
7991 /* We set the fake client as a slave waiting for the synchronization
7992 * so that Redis will not try to send replies to this client. */
7993 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7994 c->reply = listCreate();
7995 listSetFreeMethod(c->reply,decrRefCount);
7996 listSetDupMethod(c->reply,dupClientReplyValue);
7997 return c;
7998 }
7999
8000 static void freeFakeClient(struct redisClient *c) {
8001 sdsfree(c->querybuf);
8002 listRelease(c->reply);
8003 zfree(c);
8004 }
8005
8006 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
8007 * error (the append only file is zero-length) REDIS_ERR is returned. On
8008 * fatal error an error message is logged and the program exists. */
8009 int loadAppendOnlyFile(char *filename) {
8010 struct redisClient *fakeClient;
8011 FILE *fp = fopen(filename,"r");
8012 struct redis_stat sb;
8013 unsigned long long loadedkeys = 0;
8014
8015 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8016 return REDIS_ERR;
8017
8018 if (fp == NULL) {
8019 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8020 exit(1);
8021 }
8022
8023 fakeClient = createFakeClient();
8024 while(1) {
8025 int argc, j;
8026 unsigned long len;
8027 robj **argv;
8028 char buf[128];
8029 sds argsds;
8030 struct redisCommand *cmd;
8031
8032 if (fgets(buf,sizeof(buf),fp) == NULL) {
8033 if (feof(fp))
8034 break;
8035 else
8036 goto readerr;
8037 }
8038 if (buf[0] != '*') goto fmterr;
8039 argc = atoi(buf+1);
8040 argv = zmalloc(sizeof(robj*)*argc);
8041 for (j = 0; j < argc; j++) {
8042 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8043 if (buf[0] != '$') goto fmterr;
8044 len = strtol(buf+1,NULL,10);
8045 argsds = sdsnewlen(NULL,len);
8046 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
8047 argv[j] = createObject(REDIS_STRING,argsds);
8048 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8049 }
8050
8051 /* Command lookup */
8052 cmd = lookupCommand(argv[0]->ptr);
8053 if (!cmd) {
8054 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8055 exit(1);
8056 }
8057 /* Try object encoding */
8058 if (cmd->flags & REDIS_CMD_BULK)
8059 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
8060 /* Run the command in the context of a fake client */
8061 fakeClient->argc = argc;
8062 fakeClient->argv = argv;
8063 cmd->proc(fakeClient);
8064 /* Discard the reply objects list from the fake client */
8065 while(listLength(fakeClient->reply))
8066 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8067 /* Clean up, ready for the next command */
8068 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8069 zfree(argv);
8070 /* Handle swapping while loading big datasets when VM is on */
8071 loadedkeys++;
8072 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8073 while (zmalloc_used_memory() > server.vm_max_memory) {
8074 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
8075 }
8076 }
8077 }
8078 fclose(fp);
8079 freeFakeClient(fakeClient);
8080 return REDIS_OK;
8081
8082 readerr:
8083 if (feof(fp)) {
8084 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8085 } else {
8086 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8087 }
8088 exit(1);
8089 fmterr:
8090 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8091 exit(1);
8092 }
8093
8094 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
8095 static int fwriteBulkObject(FILE *fp, robj *obj) {
8096 char buf[128];
8097 int decrrc = 0;
8098
8099 /* Avoid the incr/decr ref count business if possible to help
8100 * copy-on-write (we are often in a child process when this function
8101 * is called).
8102 * Also makes sure that key objects don't get incrRefCount-ed when VM
8103 * is enabled */
8104 if (obj->encoding != REDIS_ENCODING_RAW) {
8105 obj = getDecodedObject(obj);
8106 decrrc = 1;
8107 }
8108 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8109 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8110 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8111 goto err;
8112 if (fwrite("\r\n",2,1,fp) == 0) goto err;
8113 if (decrrc) decrRefCount(obj);
8114 return 1;
8115 err:
8116 if (decrrc) decrRefCount(obj);
8117 return 0;
8118 }
8119
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it does not need to be null
 * terminated). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* len is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even on success, so an empty
     * payload is simply skipped. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8131
/* Write a double into a file in the bulk format $<count>\r\n<payload>\r\n
 * The payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8142
/* Write a long into a file in the bulk format $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of the value.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}
8153
8154 /* Write a sequence of commands able to fully rebuild the dataset into
8155 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8156 static int rewriteAppendOnlyFile(char *filename) {
8157 dictIterator *di = NULL;
8158 dictEntry *de;
8159 FILE *fp;
8160 char tmpfile[256];
8161 int j;
8162 time_t now = time(NULL);
8163
8164 /* Note that we have to use a different temp name here compared to the
8165 * one used by rewriteAppendOnlyFileBackground() function. */
8166 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8167 fp = fopen(tmpfile,"w");
8168 if (!fp) {
8169 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8170 return REDIS_ERR;
8171 }
8172 for (j = 0; j < server.dbnum; j++) {
8173 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8174 redisDb *db = server.db+j;
8175 dict *d = db->dict;
8176 if (dictSize(d) == 0) continue;
8177 di = dictGetIterator(d);
8178 if (!di) {
8179 fclose(fp);
8180 return REDIS_ERR;
8181 }
8182
8183 /* SELECT the new DB */
8184 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
8185 if (fwriteBulkLong(fp,j) == 0) goto werr;
8186
8187 /* Iterate this DB writing every entry */
8188 while((de = dictNext(di)) != NULL) {
8189 robj *key, *o;
8190 time_t expiretime;
8191 int swapped;
8192
8193 key = dictGetEntryKey(de);
8194 /* If the value for this key is swapped, load a preview in memory.
8195 * We use a "swapped" flag to remember if we need to free the
8196 * value object instead to just increment the ref count anyway
8197 * in order to avoid copy-on-write of pages if we are forked() */
8198 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8199 key->storage == REDIS_VM_SWAPPING) {
8200 o = dictGetEntryVal(de);
8201 swapped = 0;
8202 } else {
8203 o = vmPreviewObject(key);
8204 swapped = 1;
8205 }
8206 expiretime = getExpire(db,key);
8207
8208 /* Save the key and associated value */
8209 if (o->type == REDIS_STRING) {
8210 /* Emit a SET command */
8211 char cmd[]="*3\r\n$3\r\nSET\r\n";
8212 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8213 /* Key and value */
8214 if (fwriteBulkObject(fp,key) == 0) goto werr;
8215 if (fwriteBulkObject(fp,o) == 0) goto werr;
8216 } else if (o->type == REDIS_LIST) {
8217 /* Emit the RPUSHes needed to rebuild the list */
8218 list *list = o->ptr;
8219 listNode *ln;
8220 listIter li;
8221
8222 listRewind(list,&li);
8223 while((ln = listNext(&li))) {
8224 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8225 robj *eleobj = listNodeValue(ln);
8226
8227 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8228 if (fwriteBulkObject(fp,key) == 0) goto werr;
8229 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8230 }
8231 } else if (o->type == REDIS_SET) {
8232 /* Emit the SADDs needed to rebuild the set */
8233 dict *set = o->ptr;
8234 dictIterator *di = dictGetIterator(set);
8235 dictEntry *de;
8236
8237 while((de = dictNext(di)) != NULL) {
8238 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8239 robj *eleobj = dictGetEntryKey(de);
8240
8241 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8242 if (fwriteBulkObject(fp,key) == 0) goto werr;
8243 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8244 }
8245 dictReleaseIterator(di);
8246 } else if (o->type == REDIS_ZSET) {
8247 /* Emit the ZADDs needed to rebuild the sorted set */
8248 zset *zs = o->ptr;
8249 dictIterator *di = dictGetIterator(zs->dict);
8250 dictEntry *de;
8251
8252 while((de = dictNext(di)) != NULL) {
8253 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8254 robj *eleobj = dictGetEntryKey(de);
8255 double *score = dictGetEntryVal(de);
8256
8257 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8258 if (fwriteBulkObject(fp,key) == 0) goto werr;
8259 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
8260 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8261 }
8262 dictReleaseIterator(di);
8263 } else if (o->type == REDIS_HASH) {
8264 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8265
8266 /* Emit the HSETs needed to rebuild the hash */
8267 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8268 unsigned char *p = zipmapRewind(o->ptr);
8269 unsigned char *field, *val;
8270 unsigned int flen, vlen;
8271
8272 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8273 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8274 if (fwriteBulkObject(fp,key) == 0) goto werr;
8275 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8276 return -1;
8277 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8278 return -1;
8279 }
8280 } else {
8281 dictIterator *di = dictGetIterator(o->ptr);
8282 dictEntry *de;
8283
8284 while((de = dictNext(di)) != NULL) {
8285 robj *field = dictGetEntryKey(de);
8286 robj *val = dictGetEntryVal(de);
8287
8288 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8289 if (fwriteBulkObject(fp,key) == 0) goto werr;
8290 if (fwriteBulkObject(fp,field) == -1) return -1;
8291 if (fwriteBulkObject(fp,val) == -1) return -1;
8292 }
8293 dictReleaseIterator(di);
8294 }
8295 } else {
8296 redisPanic("Unknown object type");
8297 }
8298 /* Save the expire time */
8299 if (expiretime != -1) {
8300 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8301 /* If this key is already expired skip it */
8302 if (expiretime < now) continue;
8303 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8304 if (fwriteBulkObject(fp,key) == 0) goto werr;
8305 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8306 }
8307 if (swapped) decrRefCount(o);
8308 }
8309 dictReleaseIterator(di);
8310 }
8311
8312 /* Make sure data will not remain on the OS's output buffers */
8313 fflush(fp);
8314 fsync(fileno(fp));
8315 fclose(fp);
8316
8317 /* Use RENAME to make sure the DB file is changed atomically only
8318 * if the generate DB file is ok. */
8319 if (rename(tmpfile,filename) == -1) {
8320 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8321 unlink(tmpfile);
8322 return REDIS_ERR;
8323 }
8324 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8325 return REDIS_OK;
8326
8327 werr:
8328 fclose(fp);
8329 unlink(tmpfile);
8330 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8331 if (di) dictReleaseIterator(di);
8332 return REDIS_ERR;
8333 }
8334
8335 /* This is how rewriting of the append only file in background works:
8336 *
8337 * 1) The user calls BGREWRITEAOF
8338 * 2) Redis calls this function, that forks():
8339 * 2a) the child rewrite the append only file in a temp file.
8340 * 2b) the parent accumulates differences in server.bgrewritebuf.
8341 * 3) When the child finished '2a' exists.
8342 * 4) The parent will trap the exit code, if it's OK, will append the
8343 * data accumulated into server.bgrewritebuf into the temp file, and
8344 * finally will rename(2) the temp file in the actual file name.
8345 * The the new file is reopened as the new append only file. Profit!
8346 */
8347 static int rewriteAppendOnlyFileBackground(void) {
8348 pid_t childpid;
8349
8350 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8351 if (server.vm_enabled) waitEmptyIOJobsQueue();
8352 if ((childpid = fork()) == 0) {
8353 /* Child */
8354 char tmpfile[256];
8355
8356 if (server.vm_enabled) vmReopenSwapFile();
8357 close(server.fd);
8358 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8359 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8360 _exit(0);
8361 } else {
8362 _exit(1);
8363 }
8364 } else {
8365 /* Parent */
8366 if (childpid == -1) {
8367 redisLog(REDIS_WARNING,
8368 "Can't rewrite append only file in background: fork: %s",
8369 strerror(errno));
8370 return REDIS_ERR;
8371 }
8372 redisLog(REDIS_NOTICE,
8373 "Background append only file rewriting started by pid %d",childpid);
8374 server.bgrewritechildpid = childpid;
8375 updateDictResizePolicy();
8376 /* We set appendseldb to -1 in order to force the next call to the
8377 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8378 * accumulated by the parent into server.bgrewritebuf will start
8379 * with a SELECT statement and it will be safe to merge. */
8380 server.appendseldb = -1;
8381 return REDIS_OK;
8382 }
8383 return REDIS_OK; /* unreached */
8384 }
8385
8386 static void bgrewriteaofCommand(redisClient *c) {
8387 if (server.bgrewritechildpid != -1) {
8388 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8389 return;
8390 }
8391 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8392 char *status = "+Background append only file rewriting started\r\n";
8393 addReplySds(c,sdsnew(status));
8394 } else {
8395 addReply(c,shared.err);
8396 }
8397 }
8398
/* Remove the temp file used by the background append only file rewrite
 * performed by the child with pid 'childpid'. The result of unlink(2) is
 * ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8405
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make clearer which functionality belongs to the blocking VM and which to
 * the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * non-blocking by examining the command argument vector in order to load in
 * the background the values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as
 * parallel as a fully non-blocking VM.
 */
8426
8427 /* =================== Virtual Memory - Blocking Side ====================== */
8428
8429 /* substitute the first occurrence of '%p' with the process pid in the
8430 * swap file name. */
8431 static void expandVmSwapFilename(void) {
8432 char *p = strstr(server.vm_swap_file,"%p");
8433 sds new;
8434
8435 if (!p) return;
8436 new = sdsempty();
8437 *p = '\0';
8438 new = sdscat(new,server.vm_swap_file);
8439 new = sdscatprintf(new,"%ld",(long) getpid());
8440 new = sdscat(new,p+2);
8441 zfree(server.vm_swap_file);
8442 server.vm_swap_file = new;
8443 }
8444
8445 static void vmInit(void) {
8446 off_t totsize;
8447 int pipefds[2];
8448 size_t stacksize;
8449
8450 if (server.vm_max_threads != 0)
8451 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8452
8453 expandVmSwapFilename();
8454 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8455 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8456 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8457 }
8458 if (server.vm_fp == NULL) {
8459 redisLog(REDIS_WARNING,
8460 "Impossible to open the swap file: %s. Exiting.",
8461 strerror(errno));
8462 exit(1);
8463 }
8464 server.vm_fd = fileno(server.vm_fp);
8465 server.vm_next_page = 0;
8466 server.vm_near_pages = 0;
8467 server.vm_stats_used_pages = 0;
8468 server.vm_stats_swapped_objects = 0;
8469 server.vm_stats_swapouts = 0;
8470 server.vm_stats_swapins = 0;
8471 totsize = server.vm_pages*server.vm_page_size;
8472 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8473 if (ftruncate(server.vm_fd,totsize) == -1) {
8474 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8475 strerror(errno));
8476 exit(1);
8477 } else {
8478 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8479 }
8480 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8481 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8482 (long long) (server.vm_pages+7)/8, server.vm_pages);
8483 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8484
8485 /* Initialize threaded I/O (used by Virtual Memory) */
8486 server.io_newjobs = listCreate();
8487 server.io_processing = listCreate();
8488 server.io_processed = listCreate();
8489 server.io_ready_clients = listCreate();
8490 pthread_mutex_init(&server.io_mutex,NULL);
8491 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8492 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8493 server.io_active_threads = 0;
8494 if (pipe(pipefds) == -1) {
8495 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8496 ,strerror(errno));
8497 exit(1);
8498 }
8499 server.io_ready_pipe_read = pipefds[0];
8500 server.io_ready_pipe_write = pipefds[1];
8501 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8502 /* LZF requires a lot of stack */
8503 pthread_attr_init(&server.io_threads_attr);
8504 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8505 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8506 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8507 /* Listen for events in the threaded I/O pipe */
8508 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8509 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8510 oom("creating file event");
8511 }
8512
8513 /* Mark the page as used */
8514 static void vmMarkPageUsed(off_t page) {
8515 off_t byte = page/8;
8516 int bit = page&7;
8517 redisAssert(vmFreePage(page) == 1);
8518 server.vm_bitmap[byte] |= 1<<bit;
8519 }
8520
8521 /* Mark N contiguous pages as used, with 'page' being the first. */
8522 static void vmMarkPagesUsed(off_t page, off_t count) {
8523 off_t j;
8524
8525 for (j = 0; j < count; j++)
8526 vmMarkPageUsed(page+j);
8527 server.vm_stats_used_pages += count;
8528 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8529 (long long)count, (long long)page);
8530 }
8531
8532 /* Mark the page as free */
8533 static void vmMarkPageFree(off_t page) {
8534 off_t byte = page/8;
8535 int bit = page&7;
8536 redisAssert(vmFreePage(page) == 0);
8537 server.vm_bitmap[byte] &= ~(1<<bit);
8538 }
8539
8540 /* Mark N contiguous pages as free, with 'page' being the first. */
8541 static void vmMarkPagesFree(off_t page, off_t count) {
8542 off_t j;
8543
8544 for (j = 0; j < count; j++)
8545 vmMarkPageFree(page+j);
8546 server.vm_stats_used_pages -= count;
8547 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8548 (long long)count, (long long)page);
8549 }
8550
8551 /* Test if the page is free */
8552 static int vmFreePage(off_t page) {
8553 off_t byte = page/8;
8554 int bit = page&7;
8555 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8556 }
8557
8558 /* Find N contiguous free pages storing the first page of the cluster in *first.
8559 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8560 * REDIS_ERR is returned.
8561 *
8562 * This function uses a simple algorithm: we try to allocate
8563 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8564 * again from the start of the swap file searching for free spaces.
8565 *
8566 * If it looks pretty clear that there are no free pages near our offset
8567 * we try to find less populated places doing a forward jump of
8568 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8569 * without hurry, and then we jump again and so forth...
8570 *
8571 * This function can be improved using a free list to avoid to guess
8572 * too much, since we could collect data about freed pages.
8573 *
8574 * note: I implemented this function just after watching an episode of
8575 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8576 */
8577 static int vmFindContiguousPages(off_t *first, off_t n) {
8578 off_t base, offset = 0, since_jump = 0, numfree = 0;
8579
8580 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8581 server.vm_near_pages = 0;
8582 server.vm_next_page = 0;
8583 }
8584 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8585 base = server.vm_next_page;
8586
8587 while(offset < server.vm_pages) {
8588 off_t this = base+offset;
8589
8590 /* If we overflow, restart from page zero */
8591 if (this >= server.vm_pages) {
8592 this -= server.vm_pages;
8593 if (this == 0) {
8594 /* Just overflowed, what we found on tail is no longer
8595 * interesting, as it's no longer contiguous. */
8596 numfree = 0;
8597 }
8598 }
8599 if (vmFreePage(this)) {
8600 /* This is a free page */
8601 numfree++;
8602 /* Already got N free pages? Return to the caller, with success */
8603 if (numfree == n) {
8604 *first = this-(n-1);
8605 server.vm_next_page = this+1;
8606 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8607 return REDIS_OK;
8608 }
8609 } else {
8610 /* The current one is not a free page */
8611 numfree = 0;
8612 }
8613
8614 /* Fast-forward if the current page is not free and we already
8615 * searched enough near this place. */
8616 since_jump++;
8617 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8618 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8619 since_jump = 0;
8620 /* Note that even if we rewind after the jump, we are don't need
8621 * to make sure numfree is set to zero as we only jump *if* it
8622 * is set to zero. */
8623 } else {
8624 /* Otherwise just check the next page */
8625 offset++;
8626 }
8627 }
8628 return REDIS_ERR;
8629 }
8630
8631 /* Write the specified object at the specified page of the swap file */
8632 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8633 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8634 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8635 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8636 redisLog(REDIS_WARNING,
8637 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8638 strerror(errno));
8639 return REDIS_ERR;
8640 }
8641 rdbSaveObject(server.vm_fp,o);
8642 fflush(server.vm_fp);
8643 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8644 return REDIS_OK;
8645 }
8646
8647 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8648 * needed to later retrieve the object into the key object.
8649 * If we can't find enough contiguous empty pages to swap the object on disk
8650 * REDIS_ERR is returned. */
8651 static int vmSwapObjectBlocking(robj *key, robj *val) {
8652 off_t pages = rdbSavedObjectPages(val,NULL);
8653 off_t page;
8654
8655 assert(key->storage == REDIS_VM_MEMORY);
8656 assert(key->refcount == 1);
8657 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8658 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8659 key->vm.page = page;
8660 key->vm.usedpages = pages;
8661 key->storage = REDIS_VM_SWAPPED;
8662 key->vtype = val->type;
8663 decrRefCount(val); /* Deallocate the object from memory. */
8664 vmMarkPagesUsed(page,pages);
8665 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8666 (unsigned char*) key->ptr,
8667 (unsigned long long) page, (unsigned long long) pages);
8668 server.vm_stats_swapped_objects++;
8669 server.vm_stats_swapouts++;
8670 return REDIS_OK;
8671 }
8672
8673 static robj *vmReadObjectFromSwap(off_t page, int type) {
8674 robj *o;
8675
8676 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8677 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8678 redisLog(REDIS_WARNING,
8679 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8680 strerror(errno));
8681 _exit(1);
8682 }
8683 o = rdbLoadObject(type,server.vm_fp);
8684 if (o == NULL) {
8685 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8686 _exit(1);
8687 }
8688 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8689 return o;
8690 }
8691
8692 /* Load the value object relative to the 'key' object from swap to memory.
8693 * The newly allocated object is returned.
8694 *
8695 * If preview is true the unserialized object is returned to the caller but
8696 * no changes are made to the key object, nor the pages are marked as freed */
8697 static robj *vmGenericLoadObject(robj *key, int preview) {
8698 robj *val;
8699
8700 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8701 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8702 if (!preview) {
8703 key->storage = REDIS_VM_MEMORY;
8704 key->vm.atime = server.unixtime;
8705 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8706 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8707 (unsigned char*) key->ptr);
8708 server.vm_stats_swapped_objects--;
8709 } else {
8710 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8711 (unsigned char*) key->ptr);
8712 }
8713 server.vm_stats_swapins++;
8714 return val;
8715 }
8716
8717 /* Plain object loading, from swap to memory */
8718 static robj *vmLoadObject(robj *key) {
8719 /* If we are loading the object in background, stop it, we
8720 * need to load this object synchronously ASAP. */
8721 if (key->storage == REDIS_VM_LOADING)
8722 vmCancelThreadedIOJob(key);
8723 return vmGenericLoadObject(key,0);
8724 }
8725
8726 /* Just load the value on disk, without to modify the key.
8727 * This is useful when we want to perform some operation on the value
8728 * without to really bring it from swap to memory, like while saving the
8729 * dataset or rewriting the append only log. */
8730 static robj *vmPreviewObject(robj *key) {
8731 return vmGenericLoadObject(key,1);
8732 }
8733
8734 /* How a good candidate is this object for swapping?
8735 * The better candidate it is, the greater the returned value.
8736 *
8737 * Currently we try to perform a fast estimation of the object size in
8738 * memory, and combine it with aging informations.
8739 *
8740 * Basically swappability = idle-time * log(estimated size)
8741 *
8742 * Bigger objects are preferred over smaller objects, but not
8743 * proportionally, this is why we use the logarithm. This algorithm is
8744 * just a first try and will probably be tuned later. */
8745 static double computeObjectSwappability(robj *o) {
8746 time_t age = server.unixtime - o->vm.atime;
8747 long asize = 0;
8748 list *l;
8749 dict *d;
8750 struct dictEntry *de;
8751 int z;
8752
8753 if (age <= 0) return 0;
8754 switch(o->type) {
8755 case REDIS_STRING:
8756 if (o->encoding != REDIS_ENCODING_RAW) {
8757 asize = sizeof(*o);
8758 } else {
8759 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8760 }
8761 break;
8762 case REDIS_LIST:
8763 l = o->ptr;
8764 listNode *ln = listFirst(l);
8765
8766 asize = sizeof(list);
8767 if (ln) {
8768 robj *ele = ln->value;
8769 long elesize;
8770
8771 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8772 (sizeof(*o)+sdslen(ele->ptr)) :
8773 sizeof(*o);
8774 asize += (sizeof(listNode)+elesize)*listLength(l);
8775 }
8776 break;
8777 case REDIS_SET:
8778 case REDIS_ZSET:
8779 z = (o->type == REDIS_ZSET);
8780 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8781
8782 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8783 if (z) asize += sizeof(zset)-sizeof(dict);
8784 if (dictSize(d)) {
8785 long elesize;
8786 robj *ele;
8787
8788 de = dictGetRandomKey(d);
8789 ele = dictGetEntryKey(de);
8790 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8791 (sizeof(*o)+sdslen(ele->ptr)) :
8792 sizeof(*o);
8793 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8794 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8795 }
8796 break;
8797 case REDIS_HASH:
8798 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8799 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8800 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8801 unsigned int klen, vlen;
8802 unsigned char *key, *val;
8803
8804 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8805 klen = 0;
8806 vlen = 0;
8807 }
8808 asize = len*(klen+vlen+3);
8809 } else if (o->encoding == REDIS_ENCODING_HT) {
8810 d = o->ptr;
8811 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8812 if (dictSize(d)) {
8813 long elesize;
8814 robj *ele;
8815
8816 de = dictGetRandomKey(d);
8817 ele = dictGetEntryKey(de);
8818 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8819 (sizeof(*o)+sdslen(ele->ptr)) :
8820 sizeof(*o);
8821 ele = dictGetEntryVal(de);
8822 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8823 (sizeof(*o)+sdslen(ele->ptr)) :
8824 sizeof(*o);
8825 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8826 }
8827 }
8828 break;
8829 }
8830 return (double)age*log(1+asize);
8831 }
8832
8833 /* Try to swap an object that's a good candidate for swapping.
8834 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8835 * to swap any object at all.
8836 *
8837 * If 'usethreaded' is true, Redis will try to swap the object in background
8838 * using I/O threads. */
8839 static int vmSwapOneObject(int usethreads) {
8840 int j, i;
8841 struct dictEntry *best = NULL;
8842 double best_swappability = 0;
8843 redisDb *best_db = NULL;
8844 robj *key, *val;
8845
8846 for (j = 0; j < server.dbnum; j++) {
8847 redisDb *db = server.db+j;
8848 /* Why maxtries is set to 100?
8849 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8850 * are swappable objects */
8851 int maxtries = 100;
8852
8853 if (dictSize(db->dict) == 0) continue;
8854 for (i = 0; i < 5; i++) {
8855 dictEntry *de;
8856 double swappability;
8857
8858 if (maxtries) maxtries--;
8859 de = dictGetRandomKey(db->dict);
8860 key = dictGetEntryKey(de);
8861 val = dictGetEntryVal(de);
8862 /* Only swap objects that are currently in memory.
8863 *
8864 * Also don't swap shared objects if threaded VM is on, as we
8865 * try to ensure that the main thread does not touch the
8866 * object while the I/O thread is using it, but we can't
8867 * control other keys without adding additional mutex. */
8868 if (key->storage != REDIS_VM_MEMORY ||
8869 (server.vm_max_threads != 0 && val->refcount != 1)) {
8870 if (maxtries) i--; /* don't count this try */
8871 continue;
8872 }
8873 swappability = computeObjectSwappability(val);
8874 if (!best || swappability > best_swappability) {
8875 best = de;
8876 best_swappability = swappability;
8877 best_db = db;
8878 }
8879 }
8880 }
8881 if (best == NULL) return REDIS_ERR;
8882 key = dictGetEntryKey(best);
8883 val = dictGetEntryVal(best);
8884
8885 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8886 key->ptr, best_swappability);
8887
8888 /* Unshare the key if needed */
8889 if (key->refcount > 1) {
8890 robj *newkey = dupStringObject(key);
8891 decrRefCount(key);
8892 key = dictGetEntryKey(best) = newkey;
8893 }
8894 /* Swap it */
8895 if (usethreads) {
8896 vmSwapObjectThreaded(key,val,best_db);
8897 return REDIS_OK;
8898 } else {
8899 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8900 dictGetEntryVal(best) = NULL;
8901 return REDIS_OK;
8902 } else {
8903 return REDIS_ERR;
8904 }
8905 }
8906 }
8907
/* Swap one object out synchronously. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8911
/* Swap one object out using the I/O threads. See vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8915
8916 /* Return true if it's safe to swap out objects in a given moment.
8917 * Basically we don't want to swap objects out while there is a BGSAVE
8918 * or a BGAEOREWRITE running in backgroud. */
8919 static int vmCanSwapOut(void) {
8920 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8921 }
8922
8923 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8924 * and was deleted. Otherwise 0 is returned. */
8925 static int deleteIfSwapped(redisDb *db, robj *key) {
8926 dictEntry *de;
8927 robj *foundkey;
8928
8929 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8930 foundkey = dictGetEntryKey(de);
8931 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8932 deleteKey(db,key);
8933 return 1;
8934 }
8935
8936 /* =================== Virtual Memory - Threaded I/O ======================= */
8937
8938 static void freeIOJob(iojob *j) {
8939 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8940 j->type == REDIS_IOJOB_DO_SWAP ||
8941 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8942 decrRefCount(j->val);
8943 /* We don't decrRefCount the j->key field as we did't incremented
8944 * the count creating IO Jobs. This is because the key field here is
8945 * just used as an indentifier and if a key is removed the Job should
8946 * never be touched again. */
8947 zfree(j);
8948 }
8949
8950 /* Every time a thread finished a Job, it writes a byte into the write side
8951 * of an unix pipe in order to "awake" the main thread, and this function
8952 * is called. */
8953 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8954 int mask)
8955 {
8956 char buf[1];
8957 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8958 REDIS_NOTUSED(el);
8959 REDIS_NOTUSED(mask);
8960 REDIS_NOTUSED(privdata);
8961
8962 /* For every byte we read in the read side of the pipe, there is one
8963 * I/O job completed to process. */
8964 while((retval = read(fd,buf,1)) == 1) {
8965 iojob *j;
8966 listNode *ln;
8967 robj *key;
8968 struct dictEntry *de;
8969
8970 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8971
8972 /* Get the processed element (the oldest one) */
8973 lockThreadedIO();
8974 assert(listLength(server.io_processed) != 0);
8975 if (toprocess == -1) {
8976 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8977 if (toprocess <= 0) toprocess = 1;
8978 }
8979 ln = listFirst(server.io_processed);
8980 j = ln->value;
8981 listDelNode(server.io_processed,ln);
8982 unlockThreadedIO();
8983 /* If this job is marked as canceled, just ignore it */
8984 if (j->canceled) {
8985 freeIOJob(j);
8986 continue;
8987 }
8988 /* Post process it in the main thread, as there are things we
8989 * can do just here to avoid race conditions and/or invasive locks */
8990 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8991 de = dictFind(j->db->dict,j->key);
8992 assert(de != NULL);
8993 key = dictGetEntryKey(de);
8994 if (j->type == REDIS_IOJOB_LOAD) {
8995 redisDb *db;
8996
8997 /* Key loaded, bring it at home */
8998 key->storage = REDIS_VM_MEMORY;
8999 key->vm.atime = server.unixtime;
9000 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9001 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9002 (unsigned char*) key->ptr);
9003 server.vm_stats_swapped_objects--;
9004 server.vm_stats_swapins++;
9005 dictGetEntryVal(de) = j->val;
9006 incrRefCount(j->val);
9007 db = j->db;
9008 freeIOJob(j);
9009 /* Handle clients waiting for this key to be loaded. */
9010 handleClientsBlockedOnSwappedKey(db,key);
9011 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9012 /* Now we know the amount of pages required to swap this object.
9013 * Let's find some space for it, and queue this task again
9014 * rebranded as REDIS_IOJOB_DO_SWAP. */
9015 if (!vmCanSwapOut() ||
9016 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9017 {
9018 /* Ooops... no space or we can't swap as there is
9019 * a fork()ed Redis trying to save stuff on disk. */
9020 freeIOJob(j);
9021 key->storage = REDIS_VM_MEMORY; /* undo operation */
9022 } else {
9023 /* Note that we need to mark this pages as used now,
9024 * if the job will be canceled, we'll mark them as freed
9025 * again. */
9026 vmMarkPagesUsed(j->page,j->pages);
9027 j->type = REDIS_IOJOB_DO_SWAP;
9028 lockThreadedIO();
9029 queueIOJob(j);
9030 unlockThreadedIO();
9031 }
9032 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9033 robj *val;
9034
9035 /* Key swapped. We can finally free some memory. */
9036 if (key->storage != REDIS_VM_SWAPPING) {
9037 printf("key->storage: %d\n",key->storage);
9038 printf("key->name: %s\n",(char*)key->ptr);
9039 printf("key->refcount: %d\n",key->refcount);
9040 printf("val: %p\n",(void*)j->val);
9041 printf("val->type: %d\n",j->val->type);
9042 printf("val->ptr: %s\n",(char*)j->val->ptr);
9043 }
9044 redisAssert(key->storage == REDIS_VM_SWAPPING);
9045 val = dictGetEntryVal(de);
9046 key->vm.page = j->page;
9047 key->vm.usedpages = j->pages;
9048 key->storage = REDIS_VM_SWAPPED;
9049 key->vtype = j->val->type;
9050 decrRefCount(val); /* Deallocate the object from memory. */
9051 dictGetEntryVal(de) = NULL;
9052 redisLog(REDIS_DEBUG,
9053 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9054 (unsigned char*) key->ptr,
9055 (unsigned long long) j->page, (unsigned long long) j->pages);
9056 server.vm_stats_swapped_objects++;
9057 server.vm_stats_swapouts++;
9058 freeIOJob(j);
9059 /* Put a few more swap requests in queue if we are still
9060 * out of memory */
9061 if (trytoswap && vmCanSwapOut() &&
9062 zmalloc_used_memory() > server.vm_max_memory)
9063 {
9064 int more = 1;
9065 while(more) {
9066 lockThreadedIO();
9067 more = listLength(server.io_newjobs) <
9068 (unsigned) server.vm_max_threads;
9069 unlockThreadedIO();
9070 /* Don't waste CPU time if swappable objects are rare. */
9071 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9072 trytoswap = 0;
9073 break;
9074 }
9075 }
9076 }
9077 }
9078 processed++;
9079 if (processed == toprocess) return;
9080 }
9081 if (retval < 0 && errno != EAGAIN) {
9082 redisLog(REDIS_WARNING,
9083 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9084 strerror(errno));
9085 }
9086 }
9087
9088 static void lockThreadedIO(void) {
9089 pthread_mutex_lock(&server.io_mutex);
9090 }
9091
9092 static void unlockThreadedIO(void) {
9093 pthread_mutex_unlock(&server.io_mutex);
9094 }
9095
9096 /* Remove the specified object from the threaded I/O queue if still not
9097 * processed, otherwise make sure to flag it as canceled. */
9098 static void vmCancelThreadedIOJob(robj *o) {
9099 list *lists[3] = {
9100 server.io_newjobs, /* 0 */
9101 server.io_processing, /* 1 */
9102 server.io_processed /* 2 */
9103 };
9104 int i;
9105
9106 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
9107 again:
9108 lockThreadedIO();
9109 /* Search for a matching key in one of the queues */
9110 for (i = 0; i < 3; i++) {
9111 listNode *ln;
9112 listIter li;
9113
9114 listRewind(lists[i],&li);
9115 while ((ln = listNext(&li)) != NULL) {
9116 iojob *job = ln->value;
9117
9118 if (job->canceled) continue; /* Skip this, already canceled. */
9119 if (job->key == o) {
9120 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9121 (void*)job, (char*)o->ptr, job->type, i);
9122 /* Mark the pages as free since the swap didn't happened
9123 * or happened but is now discarded. */
9124 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
9125 vmMarkPagesFree(job->page,job->pages);
9126 /* Cancel the job. It depends on the list the job is
9127 * living in. */
9128 switch(i) {
9129 case 0: /* io_newjobs */
9130 /* If the job was yet not processed the best thing to do
9131 * is to remove it from the queue at all */
9132 freeIOJob(job);
9133 listDelNode(lists[i],ln);
9134 break;
9135 case 1: /* io_processing */
9136 /* Oh Shi- the thread is messing with the Job:
9137 *
9138 * Probably it's accessing the object if this is a
9139 * PREPARE_SWAP or DO_SWAP job.
9140 * If it's a LOAD job it may be reading from disk and
9141 * if we don't wait for the job to terminate before to
9142 * cancel it, maybe in a few microseconds data can be
9143 * corrupted in this pages. So the short story is:
9144 *
9145 * Better to wait for the job to move into the
9146 * next queue (processed)... */
9147
9148 /* We try again and again until the job is completed. */
9149 unlockThreadedIO();
9150 /* But let's wait some time for the I/O thread
9151 * to finish with this job. After all this condition
9152 * should be very rare. */
9153 usleep(1);
9154 goto again;
9155 case 2: /* io_processed */
9156 /* The job was already processed, that's easy...
9157 * just mark it as canceled so that we'll ignore it
9158 * when processing completed jobs. */
9159 job->canceled = 1;
9160 break;
9161 }
9162 /* Finally we have to adjust the storage type of the object
9163 * in order to "UNDO" the operaiton. */
9164 if (o->storage == REDIS_VM_LOADING)
9165 o->storage = REDIS_VM_SWAPPED;
9166 else if (o->storage == REDIS_VM_SWAPPING)
9167 o->storage = REDIS_VM_MEMORY;
9168 unlockThreadedIO();
9169 return;
9170 }
9171 }
9172 }
9173 unlockThreadedIO();
9174 assert(1 != 1); /* We should never reach this */
9175 }
9176
9177 static void *IOThreadEntryPoint(void *arg) {
9178 iojob *j;
9179 listNode *ln;
9180 REDIS_NOTUSED(arg);
9181
9182 pthread_detach(pthread_self());
9183 while(1) {
9184 /* Get a new job to process */
9185 lockThreadedIO();
9186 if (listLength(server.io_newjobs) == 0) {
9187 /* No new jobs in queue, exit. */
9188 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9189 (long) pthread_self());
9190 server.io_active_threads--;
9191 unlockThreadedIO();
9192 return NULL;
9193 }
9194 ln = listFirst(server.io_newjobs);
9195 j = ln->value;
9196 listDelNode(server.io_newjobs,ln);
9197 /* Add the job in the processing queue */
9198 j->thread = pthread_self();
9199 listAddNodeTail(server.io_processing,j);
9200 ln = listLast(server.io_processing); /* We use ln later to remove it */
9201 unlockThreadedIO();
9202 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9203 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9204
9205 /* Process the Job */
9206 if (j->type == REDIS_IOJOB_LOAD) {
9207 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9208 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9209 FILE *fp = fopen("/dev/null","w+");
9210 j->pages = rdbSavedObjectPages(j->val,fp);
9211 fclose(fp);
9212 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9213 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9214 j->canceled = 1;
9215 }
9216
9217 /* Done: insert the job into the processed queue */
9218 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9219 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9220 lockThreadedIO();
9221 listDelNode(server.io_processing,ln);
9222 listAddNodeTail(server.io_processed,j);
9223 unlockThreadedIO();
9224
9225 /* Signal the main thread there is new stuff to process */
9226 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9227 }
9228 return NULL; /* never reached */
9229 }
9230
9231 static void spawnIOThread(void) {
9232 pthread_t thread;
9233 sigset_t mask, omask;
9234 int err;
9235
9236 sigemptyset(&mask);
9237 sigaddset(&mask,SIGCHLD);
9238 sigaddset(&mask,SIGHUP);
9239 sigaddset(&mask,SIGPIPE);
9240 pthread_sigmask(SIG_SETMASK, &mask, &omask);
9241 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9242 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9243 strerror(err));
9244 usleep(1000000);
9245 }
9246 pthread_sigmask(SIG_SETMASK, &omask, NULL);
9247 server.io_active_threads++;
9248 }
9249
9250 /* We need to wait for the last thread to exit before we are able to
9251 * fork() in order to BGSAVE or BGREWRITEAOF. */
9252 static void waitEmptyIOJobsQueue(void) {
9253 while(1) {
9254 int io_processed_len;
9255
9256 lockThreadedIO();
9257 if (listLength(server.io_newjobs) == 0 &&
9258 listLength(server.io_processing) == 0 &&
9259 server.io_active_threads == 0)
9260 {
9261 unlockThreadedIO();
9262 return;
9263 }
9264 /* While waiting for empty jobs queue condition we post-process some
9265 * finshed job, as I/O threads may be hanging trying to write against
9266 * the io_ready_pipe_write FD but there are so much pending jobs that
9267 * it's blocking. */
9268 io_processed_len = listLength(server.io_processed);
9269 unlockThreadedIO();
9270 if (io_processed_len) {
9271 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9272 usleep(1000); /* 1 millisecond */
9273 } else {
9274 usleep(10000); /* 10 milliseconds */
9275 }
9276 }
9277 }
9278
9279 static void vmReopenSwapFile(void) {
9280 /* Note: we don't close the old one as we are in the child process
9281 * and don't want to mess at all with the original file object. */
9282 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9283 if (server.vm_fp == NULL) {
9284 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9285 server.vm_swap_file);
9286 _exit(1);
9287 }
9288 server.vm_fd = fileno(server.vm_fp);
9289 }
9290
9291 /* This function must be called while with threaded IO locked */
9292 static void queueIOJob(iojob *j) {
9293 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9294 (void*)j, j->type, (char*)j->key->ptr);
9295 listAddNodeTail(server.io_newjobs,j);
9296 if (server.io_active_threads < server.vm_max_threads)
9297 spawnIOThread();
9298 }
9299
9300 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9301 iojob *j;
9302
9303 assert(key->storage == REDIS_VM_MEMORY);
9304 assert(key->refcount == 1);
9305
9306 j = zmalloc(sizeof(*j));
9307 j->type = REDIS_IOJOB_PREPARE_SWAP;
9308 j->db = db;
9309 j->key = key;
9310 j->val = val;
9311 incrRefCount(val);
9312 j->canceled = 0;
9313 j->thread = (pthread_t) -1;
9314 key->storage = REDIS_VM_SWAPPING;
9315
9316 lockThreadedIO();
9317 queueIOJob(j);
9318 unlockThreadedIO();
9319 return REDIS_OK;
9320 }
9321
9322 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9323
9324 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9325 * If there is not already a job loading the key, it is craeted.
9326 * The key is added to the io_keys list in the client structure, and also
9327 * in the hash table mapping swapped keys to waiting clients, that is,
9328 * server.io_waited_keys. */
9329 static int waitForSwappedKey(redisClient *c, robj *key) {
9330 struct dictEntry *de;
9331 robj *o;
9332 list *l;
9333
9334 /* If the key does not exist or is already in RAM we don't need to
9335 * block the client at all. */
9336 de = dictFind(c->db->dict,key);
9337 if (de == NULL) return 0;
9338 o = dictGetEntryKey(de);
9339 if (o->storage == REDIS_VM_MEMORY) {
9340 return 0;
9341 } else if (o->storage == REDIS_VM_SWAPPING) {
9342 /* We were swapping the key, undo it! */
9343 vmCancelThreadedIOJob(o);
9344 return 0;
9345 }
9346
9347 /* OK: the key is either swapped, or being loaded just now. */
9348
9349 /* Add the key to the list of keys this client is waiting for.
9350 * This maps clients to keys they are waiting for. */
9351 listAddNodeTail(c->io_keys,key);
9352 incrRefCount(key);
9353
9354 /* Add the client to the swapped keys => clients waiting map. */
9355 de = dictFind(c->db->io_keys,key);
9356 if (de == NULL) {
9357 int retval;
9358
9359 /* For every key we take a list of clients blocked for it */
9360 l = listCreate();
9361 retval = dictAdd(c->db->io_keys,key,l);
9362 incrRefCount(key);
9363 assert(retval == DICT_OK);
9364 } else {
9365 l = dictGetEntryVal(de);
9366 }
9367 listAddNodeTail(l,c);
9368
9369 /* Are we already loading the key from disk? If not create a job */
9370 if (o->storage == REDIS_VM_SWAPPED) {
9371 iojob *j;
9372
9373 o->storage = REDIS_VM_LOADING;
9374 j = zmalloc(sizeof(*j));
9375 j->type = REDIS_IOJOB_LOAD;
9376 j->db = c->db;
9377 j->key = o;
9378 j->key->vtype = o->vtype;
9379 j->page = o->vm.page;
9380 j->val = NULL;
9381 j->canceled = 0;
9382 j->thread = (pthread_t) -1;
9383 lockThreadedIO();
9384 queueIOJob(j);
9385 unlockThreadedIO();
9386 }
9387 return 1;
9388 }
9389
9390 /* Preload keys needed for the ZUNION and ZINTER commands. */
9391 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9392 int i, num;
9393 num = atoi(c->argv[2]->ptr);
9394 for (i = 0; i < num; i++) {
9395 waitForSwappedKey(c,c->argv[3+i]);
9396 }
9397 }
9398
9399 /* Is this client attempting to run a command against swapped keys?
9400 * If so, block it ASAP, load the keys in background, then resume it.
9401 *
9402 * The important idea about this function is that it can fail! If keys will
9403 * still be swapped when the client is resumed, this key lookups will
9404 * just block loading keys from disk. In practical terms this should only
9405 * happen with SORT BY command or if there is a bug in this function.
9406 *
9407 * Return 1 if the client is marked as blocked, 0 if the client can
9408 * continue as the keys it is going to access appear to be in memory. */
9409 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9410 int j, last;
9411
9412 if (cmd->vm_preload_proc != NULL) {
9413 cmd->vm_preload_proc(c);
9414 } else {
9415 if (cmd->vm_firstkey == 0) return 0;
9416 last = cmd->vm_lastkey;
9417 if (last < 0) last = c->argc+last;
9418 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9419 waitForSwappedKey(c,c->argv[j]);
9420 }
9421
9422 /* If the client was blocked for at least one key, mark it as blocked. */
9423 if (listLength(c->io_keys)) {
9424 c->flags |= REDIS_IO_WAIT;
9425 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9426 server.vm_blocked_clients++;
9427 return 1;
9428 } else {
9429 return 0;
9430 }
9431 }
9432
9433 /* Remove the 'key' from the list of blocked keys for a given client.
9434 *
9435 * The function returns 1 when there are no longer blocking keys after
9436 * the current one was removed (and the client can be unblocked). */
9437 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9438 list *l;
9439 listNode *ln;
9440 listIter li;
9441 struct dictEntry *de;
9442
9443 /* Remove the key from the list of keys this client is waiting for. */
9444 listRewind(c->io_keys,&li);
9445 while ((ln = listNext(&li)) != NULL) {
9446 if (compareStringObjects(ln->value,key) == 0) {
9447 listDelNode(c->io_keys,ln);
9448 break;
9449 }
9450 }
9451 assert(ln != NULL);
9452
9453 /* Remove the client form the key => waiting clients map. */
9454 de = dictFind(c->db->io_keys,key);
9455 assert(de != NULL);
9456 l = dictGetEntryVal(de);
9457 ln = listSearchKey(l,c);
9458 assert(ln != NULL);
9459 listDelNode(l,ln);
9460 if (listLength(l) == 0)
9461 dictDelete(c->db->io_keys,key);
9462
9463 return listLength(c->io_keys) == 0;
9464 }
9465
9466 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9467 struct dictEntry *de;
9468 list *l;
9469 listNode *ln;
9470 int len;
9471
9472 de = dictFind(db->io_keys,key);
9473 if (!de) return;
9474
9475 l = dictGetEntryVal(de);
9476 len = listLength(l);
9477 /* Note: we can't use something like while(listLength(l)) as the list
9478 * can be freed by the calling function when we remove the last element. */
9479 while (len--) {
9480 ln = listFirst(l);
9481 redisClient *c = ln->value;
9482
9483 if (dontWaitForSwappedKey(c,key)) {
9484 /* Put the client in the list of clients ready to go as we
9485 * loaded all the keys about it. */
9486 listAddNodeTail(server.io_ready_clients,c);
9487 }
9488 }
9489 }
9490
9491 /* =========================== Remote Configuration ========================= */
9492
9493 static void configSetCommand(redisClient *c) {
9494 robj *o = getDecodedObject(c->argv[3]);
9495 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9496 zfree(server.dbfilename);
9497 server.dbfilename = zstrdup(o->ptr);
9498 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9499 zfree(server.requirepass);
9500 server.requirepass = zstrdup(o->ptr);
9501 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9502 zfree(server.masterauth);
9503 server.masterauth = zstrdup(o->ptr);
9504 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9505 server.maxmemory = strtoll(o->ptr, NULL, 10);
9506 } else {
9507 addReplySds(c,sdscatprintf(sdsempty(),
9508 "-ERR not supported CONFIG parameter %s\r\n",
9509 (char*)c->argv[2]->ptr));
9510 decrRefCount(o);
9511 return;
9512 }
9513 decrRefCount(o);
9514 addReply(c,shared.ok);
9515 }
9516
9517 static void configGetCommand(redisClient *c) {
9518 robj *o = getDecodedObject(c->argv[2]);
9519 robj *lenobj = createObject(REDIS_STRING,NULL);
9520 char *pattern = o->ptr;
9521 int matches = 0;
9522
9523 addReply(c,lenobj);
9524 decrRefCount(lenobj);
9525
9526 if (stringmatch(pattern,"dbfilename",0)) {
9527 addReplyBulkCString(c,"dbfilename");
9528 addReplyBulkCString(c,server.dbfilename);
9529 matches++;
9530 }
9531 if (stringmatch(pattern,"requirepass",0)) {
9532 addReplyBulkCString(c,"requirepass");
9533 addReplyBulkCString(c,server.requirepass);
9534 matches++;
9535 }
9536 if (stringmatch(pattern,"masterauth",0)) {
9537 addReplyBulkCString(c,"masterauth");
9538 addReplyBulkCString(c,server.masterauth);
9539 matches++;
9540 }
9541 if (stringmatch(pattern,"maxmemory",0)) {
9542 char buf[128];
9543
9544 snprintf(buf,128,"%llu\n",server.maxmemory);
9545 addReplyBulkCString(c,"maxmemory");
9546 addReplyBulkCString(c,buf);
9547 matches++;
9548 }
9549 decrRefCount(o);
9550 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9551 }
9552
9553 static void configCommand(redisClient *c) {
9554 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9555 if (c->argc != 4) goto badarity;
9556 configSetCommand(c);
9557 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9558 if (c->argc != 3) goto badarity;
9559 configGetCommand(c);
9560 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9561 if (c->argc != 2) goto badarity;
9562 server.stat_numcommands = 0;
9563 server.stat_numconnections = 0;
9564 server.stat_expiredkeys = 0;
9565 server.stat_starttime = time(NULL);
9566 addReply(c,shared.ok);
9567 } else {
9568 addReplySds(c,sdscatprintf(sdsempty(),
9569 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9570 }
9571 return;
9572
9573 badarity:
9574 addReplySds(c,sdscatprintf(sdsempty(),
9575 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9576 (char*) c->argv[1]->ptr));
9577 }
9578
9579 /* =========================== Pubsub implementation ======================== */
9580
9581 static void freePubsubPattern(void *p) {
9582 pubsubPattern *pat = p;
9583
9584 decrRefCount(pat->pattern);
9585 zfree(pat);
9586 }
9587
9588 static int listMatchPubsubPattern(void *a, void *b) {
9589 pubsubPattern *pa = a, *pb = b;
9590
9591 return (pa->client == pb->client) &&
9592 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9593 }
9594
9595 /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9596 * 0 if the client was already subscribed to that channel. */
9597 static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
9598 struct dictEntry *de;
9599 list *clients = NULL;
9600 int retval = 0;
9601
9602 /* Add the channel to the client -> channels hash table */
9603 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
9604 retval = 1;
9605 incrRefCount(channel);
9606 /* Add the client to the channel -> list of clients hash table */
9607 de = dictFind(server.pubsub_channels,channel);
9608 if (de == NULL) {
9609 clients = listCreate();
9610 dictAdd(server.pubsub_channels,channel,clients);
9611 incrRefCount(channel);
9612 } else {
9613 clients = dictGetEntryVal(de);
9614 }
9615 listAddNodeTail(clients,c);
9616 }
9617 /* Notify the client */
9618 addReply(c,shared.mbulk3);
9619 addReply(c,shared.subscribebulk);
9620 addReplyBulk(c,channel);
9621 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9622 return retval;
9623 }
9624
9625 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9626 * 0 if the client was not subscribed to the specified channel. */
9627 static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
9628 struct dictEntry *de;
9629 list *clients;
9630 listNode *ln;
9631 int retval = 0;
9632
9633 /* Remove the channel from the client -> channels hash table */
9634 incrRefCount(channel); /* channel may be just a pointer to the same object
9635 we have in the hash tables. Protect it... */
9636 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
9637 retval = 1;
9638 /* Remove the client from the channel -> clients list hash table */
9639 de = dictFind(server.pubsub_channels,channel);
9640 assert(de != NULL);
9641 clients = dictGetEntryVal(de);
9642 ln = listSearchKey(clients,c);
9643 assert(ln != NULL);
9644 listDelNode(clients,ln);
9645 if (listLength(clients) == 0) {
9646 /* Free the list and associated hash entry at all if this was
9647 * the latest client, so that it will be possible to abuse
9648 * Redis PUBSUB creating millions of channels. */
9649 dictDelete(server.pubsub_channels,channel);
9650 }
9651 }
9652 /* Notify the client */
9653 if (notify) {
9654 addReply(c,shared.mbulk3);
9655 addReply(c,shared.unsubscribebulk);
9656 addReplyBulk(c,channel);
9657 addReplyLong(c,dictSize(c->pubsub_channels)+
9658 listLength(c->pubsub_patterns));
9659
9660 }
9661 decrRefCount(channel); /* it is finally safe to release it */
9662 return retval;
9663 }
9664
9665 /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9666 static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9667 int retval = 0;
9668
9669 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9670 retval = 1;
9671 pubsubPattern *pat;
9672 listAddNodeTail(c->pubsub_patterns,pattern);
9673 incrRefCount(pattern);
9674 pat = zmalloc(sizeof(*pat));
9675 pat->pattern = getDecodedObject(pattern);
9676 pat->client = c;
9677 listAddNodeTail(server.pubsub_patterns,pat);
9678 }
9679 /* Notify the client */
9680 addReply(c,shared.mbulk3);
9681 addReply(c,shared.psubscribebulk);
9682 addReplyBulk(c,pattern);
9683 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9684 return retval;
9685 }
9686
9687 /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9688 * 0 if the client was not subscribed to the specified channel. */
9689 static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9690 listNode *ln;
9691 pubsubPattern pat;
9692 int retval = 0;
9693
9694 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9695 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9696 retval = 1;
9697 listDelNode(c->pubsub_patterns,ln);
9698 pat.client = c;
9699 pat.pattern = pattern;
9700 ln = listSearchKey(server.pubsub_patterns,&pat);
9701 listDelNode(server.pubsub_patterns,ln);
9702 }
9703 /* Notify the client */
9704 if (notify) {
9705 addReply(c,shared.mbulk3);
9706 addReply(c,shared.punsubscribebulk);
9707 addReplyBulk(c,pattern);
9708 addReplyLong(c,dictSize(c->pubsub_channels)+
9709 listLength(c->pubsub_patterns));
9710 }
9711 decrRefCount(pattern);
9712 return retval;
9713 }
9714
9715 /* Unsubscribe from all the channels. Return the number of channels the
9716 * client was subscribed from. */
9717 static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9718 dictIterator *di = dictGetIterator(c->pubsub_channels);
9719 dictEntry *de;
9720 int count = 0;
9721
9722 while((de = dictNext(di)) != NULL) {
9723 robj *channel = dictGetEntryKey(de);
9724
9725 count += pubsubUnsubscribeChannel(c,channel,notify);
9726 }
9727 dictReleaseIterator(di);
9728 return count;
9729 }
9730
9731 /* Unsubscribe from all the patterns. Return the number of patterns the
9732 * client was subscribed from. */
9733 static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9734 listNode *ln;
9735 listIter li;
9736 int count = 0;
9737
9738 listRewind(c->pubsub_patterns,&li);
9739 while ((ln = listNext(&li)) != NULL) {
9740 robj *pattern = ln->value;
9741
9742 count += pubsubUnsubscribePattern(c,pattern,notify);
9743 }
9744 return count;
9745 }
9746
9747 /* Publish a message */
9748 static int pubsubPublishMessage(robj *channel, robj *message) {
9749 int receivers = 0;
9750 struct dictEntry *de;
9751 listNode *ln;
9752 listIter li;
9753
9754 /* Send to clients listening for that channel */
9755 de = dictFind(server.pubsub_channels,channel);
9756 if (de) {
9757 list *list = dictGetEntryVal(de);
9758 listNode *ln;
9759 listIter li;
9760
9761 listRewind(list,&li);
9762 while ((ln = listNext(&li)) != NULL) {
9763 redisClient *c = ln->value;
9764
9765 addReply(c,shared.mbulk3);
9766 addReply(c,shared.messagebulk);
9767 addReplyBulk(c,channel);
9768 addReplyBulk(c,message);
9769 receivers++;
9770 }
9771 }
9772 /* Send to clients listening to matching channels */
9773 if (listLength(server.pubsub_patterns)) {
9774 listRewind(server.pubsub_patterns,&li);
9775 channel = getDecodedObject(channel);
9776 while ((ln = listNext(&li)) != NULL) {
9777 pubsubPattern *pat = ln->value;
9778
9779 if (stringmatchlen((char*)pat->pattern->ptr,
9780 sdslen(pat->pattern->ptr),
9781 (char*)channel->ptr,
9782 sdslen(channel->ptr),0)) {
9783 addReply(pat->client,shared.mbulk3);
9784 addReply(pat->client,shared.messagebulk);
9785 addReplyBulk(pat->client,channel);
9786 addReplyBulk(pat->client,message);
9787 receivers++;
9788 }
9789 }
9790 decrRefCount(channel);
9791 }
9792 return receivers;
9793 }
9794
9795 static void subscribeCommand(redisClient *c) {
9796 int j;
9797
9798 for (j = 1; j < c->argc; j++)
9799 pubsubSubscribeChannel(c,c->argv[j]);
9800 }
9801
9802 static void unsubscribeCommand(redisClient *c) {
9803 if (c->argc == 1) {
9804 pubsubUnsubscribeAllChannels(c,1);
9805 return;
9806 } else {
9807 int j;
9808
9809 for (j = 1; j < c->argc; j++)
9810 pubsubUnsubscribeChannel(c,c->argv[j],1);
9811 }
9812 }
9813
9814 static void psubscribeCommand(redisClient *c) {
9815 int j;
9816
9817 for (j = 1; j < c->argc; j++)
9818 pubsubSubscribePattern(c,c->argv[j]);
9819 }
9820
9821 static void punsubscribeCommand(redisClient *c) {
9822 if (c->argc == 1) {
9823 pubsubUnsubscribeAllPatterns(c,1);
9824 return;
9825 } else {
9826 int j;
9827
9828 for (j = 1; j < c->argc; j++)
9829 pubsubUnsubscribePattern(c,c->argv[j],1);
9830 }
9831 }
9832
9833 static void publishCommand(redisClient *c) {
9834 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9835 addReplyLong(c,receivers);
9836 }
9837
9838 /* ================================= Debugging ============================== */
9839
9840 static void debugCommand(redisClient *c) {
9841 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9842 *((char*)-1) = 'x';
9843 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9844 if (rdbSave(server.dbfilename) != REDIS_OK) {
9845 addReply(c,shared.err);
9846 return;
9847 }
9848 emptyDb();
9849 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9850 addReply(c,shared.err);
9851 return;
9852 }
9853 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9854 addReply(c,shared.ok);
9855 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9856 emptyDb();
9857 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9858 addReply(c,shared.err);
9859 return;
9860 }
9861 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9862 addReply(c,shared.ok);
9863 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9864 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9865 robj *key, *val;
9866
9867 if (!de) {
9868 addReply(c,shared.nokeyerr);
9869 return;
9870 }
9871 key = dictGetEntryKey(de);
9872 val = dictGetEntryVal(de);
9873 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9874 key->storage == REDIS_VM_SWAPPING)) {
9875 char *strenc;
9876 char buf[128];
9877
9878 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9879 strenc = strencoding[val->encoding];
9880 } else {
9881 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9882 strenc = buf;
9883 }
9884 addReplySds(c,sdscatprintf(sdsempty(),
9885 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9886 "encoding:%s serializedlength:%lld\r\n",
9887 (void*)key, key->refcount, (void*)val, val->refcount,
9888 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9889 } else {
9890 addReplySds(c,sdscatprintf(sdsempty(),
9891 "+Key at:%p refcount:%d, value swapped at: page %llu "
9892 "using %llu pages\r\n",
9893 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9894 (unsigned long long) key->vm.usedpages));
9895 }
9896 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9897 lookupKeyRead(c->db,c->argv[2]);
9898 addReply(c,shared.ok);
9899 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9900 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9901 robj *key, *val;
9902
9903 if (!server.vm_enabled) {
9904 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9905 return;
9906 }
9907 if (!de) {
9908 addReply(c,shared.nokeyerr);
9909 return;
9910 }
9911 key = dictGetEntryKey(de);
9912 val = dictGetEntryVal(de);
9913 /* If the key is shared we want to create a copy */
9914 if (key->refcount > 1) {
9915 robj *newkey = dupStringObject(key);
9916 decrRefCount(key);
9917 key = dictGetEntryKey(de) = newkey;
9918 }
9919 /* Swap it */
9920 if (key->storage != REDIS_VM_MEMORY) {
9921 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9922 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9923 dictGetEntryVal(de) = NULL;
9924 addReply(c,shared.ok);
9925 } else {
9926 addReply(c,shared.err);
9927 }
9928 } else {
9929 addReplySds(c,sdsnew(
9930 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
9931 }
9932 }
9933
9934 static void _redisAssert(char *estr, char *file, int line) {
9935 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9936 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9937 #ifdef HAVE_BACKTRACE
9938 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9939 *((char*)-1) = 'x';
9940 #endif
9941 }
9942
9943 static void _redisPanic(char *msg, char *file, int line) {
9944 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
9945 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
9946 #ifdef HAVE_BACKTRACE
9947 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9948 *((char*)-1) = 'x';
9949 #endif
9950 }
9951
9952 /* =================================== Main! ================================ */
9953
9954 #ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its first line converted
 * with atoi(). Returns -1 if the file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile != NULL) {
        if (fgets(line,64,procfile) != NULL) value = atoi(line);
        fclose(procfile);
    }
    return value;
}
9968
9969 void linuxOvercommitMemoryWarning(void) {
9970 if (linuxOvercommitMemoryValue() == 0) {
9971 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9972 }
9973 }
9974 #endif /* __linux__ */
9975
9976 static void daemonize(void) {
9977 int fd;
9978 FILE *fp;
9979
9980 if (fork() != 0) exit(0); /* parent exits */
9981 setsid(); /* create a new session */
9982
9983 /* Every output goes to /dev/null. If Redis is daemonized but
9984 * the 'logfile' is set to 'stdout' in the configuration file
9985 * it will not log at all. */
9986 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9987 dup2(fd, STDIN_FILENO);
9988 dup2(fd, STDOUT_FILENO);
9989 dup2(fd, STDERR_FILENO);
9990 if (fd > STDERR_FILENO) close(fd);
9991 }
9992 /* Try to write the pid file */
9993 fp = fopen(server.pidfile,"w");
9994 if (fp) {
9995 fprintf(fp,"%d\n",getpid());
9996 fclose(fp);
9997 }
9998 }
9999
10000 static void version() {
10001 printf("Redis server version %s\n", REDIS_VERSION);
10002 exit(0);
10003 }
10004
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10010
10011 int main(int argc, char **argv) {
10012 time_t start;
10013
10014 initServerConfig();
10015 if (argc == 2) {
10016 if (strcmp(argv[1], "-v") == 0 ||
10017 strcmp(argv[1], "--version") == 0) version();
10018 if (strcmp(argv[1], "--help") == 0) usage();
10019 resetServerSaveParams();
10020 loadServerConfig(argv[1]);
10021 } else if ((argc > 2)) {
10022 usage();
10023 } else {
10024 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10025 }
10026 if (server.daemonize) daemonize();
10027 initServer();
10028 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10029 #ifdef __linux__
10030 linuxOvercommitMemoryWarning();
10031 #endif
10032 start = time(NULL);
10033 if (server.appendonly) {
10034 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
10035 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
10036 } else {
10037 if (rdbLoad(server.dbfilename) == REDIS_OK)
10038 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
10039 }
10040 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
10041 aeSetBeforeSleepProc(server.el,beforeSleep);
10042 aeMain(server.el);
10043 aeDeleteEventLoop(server.el);
10044 return 0;
10045 }
10046
10047 /* ============================= Backtrace support ========================= */
10048
10049 #ifdef HAVE_BACKTRACE
10050 static char *findFuncName(void *pointer, unsigned long *offset);
10051
10052 static void *getMcontextEip(ucontext_t *uc) {
10053 #if defined(__FreeBSD__)
10054 return (void*) uc->uc_mcontext.mc_eip;
10055 #elif defined(__dietlibc__)
10056 return (void*) uc->uc_mcontext.eip;
10057 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
10058 #if __x86_64__
10059 return (void*) uc->uc_mcontext->__ss.__rip;
10060 #else
10061 return (void*) uc->uc_mcontext->__ss.__eip;
10062 #endif
10063 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
10064 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
10065 return (void*) uc->uc_mcontext->__ss.__rip;
10066 #else
10067 return (void*) uc->uc_mcontext->__ss.__eip;
10068 #endif
10069 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
10070 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
10071 #elif defined(__ia64__) /* Linux IA64 */
10072 return (void*) uc->uc_mcontext.sc_ip;
10073 #else
10074 return NULL;
10075 #endif
10076 }
10077
10078 static void segvHandler(int sig, siginfo_t *info, void *secret) {
10079 void *trace[100];
10080 char **messages = NULL;
10081 int i, trace_size = 0;
10082 unsigned long offset=0;
10083 ucontext_t *uc = (ucontext_t*) secret;
10084 sds infostring;
10085 REDIS_NOTUSED(info);
10086
10087 redisLog(REDIS_WARNING,
10088 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
10089 infostring = genRedisInfoString();
10090 redisLog(REDIS_WARNING, "%s",infostring);
10091 /* It's not safe to sdsfree() the returned string under memory
10092 * corruption conditions. Let it leak as we are going to abort */
10093
10094 trace_size = backtrace(trace, 100);
10095 /* overwrite sigaction with caller's address */
10096 if (getMcontextEip(uc) != NULL) {
10097 trace[1] = getMcontextEip(uc);
10098 }
10099 messages = backtrace_symbols(trace, trace_size);
10100
10101 for (i=1; i<trace_size; ++i) {
10102 char *fn = findFuncName(trace[i], &offset), *p;
10103
10104 p = strchr(messages[i],'+');
10105 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10106 redisLog(REDIS_WARNING,"%s", messages[i]);
10107 } else {
10108 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10109 }
10110 }
10111 /* free(messages); Don't call free() with possibly corrupted memory. */
10112 _exit(0);
10113 }
10114
10115 static void setupSigSegvAction(void) {
10116 struct sigaction act;
10117
10118 sigemptyset (&act.sa_mask);
10119 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10120 * is used. Otherwise, sa_handler is used */
10121 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10122 act.sa_sigaction = segvHandler;
10123 sigaction (SIGSEGV, &act, NULL);
10124 sigaction (SIGBUS, &act, NULL);
10125 sigaction (SIGFPE, &act, NULL);
10126 sigaction (SIGILL, &act, NULL);
10127 sigaction (SIGBUS, &act, NULL);
10128 return;
10129 }
10130
10131 #include "staticsymbols.h"
10132 /* This function try to convert a pointer into a function name. It's used in
10133 * oreder to provide a backtrace under segmentation fault that's able to
10134 * display functions declared as static (otherwise the backtrace is useless). */
10135 static char *findFuncName(void *pointer, unsigned long *offset){
10136 int i, ret = -1;
10137 unsigned long off, minoff = 0;
10138
10139 /* Try to match against the Symbol with the smallest offset */
10140 for (i=0; symsTable[i].pointer; i++) {
10141 unsigned long lp = (unsigned long) pointer;
10142
10143 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10144 off=lp-symsTable[i].pointer;
10145 if (ret < 0 || off < minoff) {
10146 minoff=off;
10147 ret=i;
10148 }
10149 }
10150 }
10151 if (ret == -1) return NULL;
10152 *offset = minoff;
10153 return symsTable[ret].name;
10154 }
10155 #else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE there is no crash handler to install: this stub
 * only keeps the callers compiling. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
10158 #endif /* HAVE_BACKTRACE */
10159
10160
10161
10162 /* The End */
10163
10164
10165