]> git.saurik.com Git - redis.git/blob - redis.c
44db9d683681914715900e4b3641e0d9f4eda751
[redis.git] / redis.c
1 /*
2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #define REDIS_VERSION "1.3.8"
31
32 #include "fmacros.h"
33 #include "config.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #define __USE_POSIX199309
41 #define __USE_UNIX98
42 #include <signal.h>
43
44 #ifdef HAVE_BACKTRACE
45 #include <execinfo.h>
46 #include <ucontext.h>
47 #endif /* HAVE_BACKTRACE */
48
49 #include <sys/wait.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <ctype.h>
53 #include <stdarg.h>
54 #include <inttypes.h>
55 #include <arpa/inet.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/time.h>
59 #include <sys/resource.h>
60 #include <sys/uio.h>
61 #include <limits.h>
62 #include <math.h>
63 #include <pthread.h>
64
65 #if defined(__sun)
66 #include "solarisfixes.h"
67 #endif
68
69 #include "redis.h"
70 #include "ae.h" /* Event driven programming library */
71 #include "sds.h" /* Dynamic safe strings */
72 #include "anet.h" /* Networking the easy way */
73 #include "dict.h" /* Hash tables */
74 #include "adlist.h" /* Linked lists */
75 #include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 #include "lzf.h" /* LZF compression library */
77 #include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78 #include "zipmap.h"
79
80 /* Error codes */
81 #define REDIS_OK 0
82 #define REDIS_ERR -1
83
84 /* Static server configuration */
85 #define REDIS_SERVERPORT 6379 /* TCP port */
86 #define REDIS_MAXIDLETIME (60*5) /* default client timeout */
87 #define REDIS_IOBUF_LEN 1024
88 #define REDIS_LOADBUF_LEN 1024
89 #define REDIS_STATIC_ARGS 8
90 #define REDIS_DEFAULT_DBNUM 16
91 #define REDIS_CONFIGLINE_MAX 1024
92 #define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93 #define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94 #define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
95 #define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 #define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98 /* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99 #define REDIS_WRITEV_THRESHOLD 3
100 /* Max number of iovecs used for each writev call */
101 #define REDIS_WRITEV_IOVEC_COUNT 256
102
103 /* Hash table parameters */
104 #define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
105
106 /* Command flags */
107 #define REDIS_CMD_BULK 1 /* Bulk write command */
108 #define REDIS_CMD_INLINE 2 /* Inline command */
109 /* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113 #define REDIS_CMD_DENYOOM 4
114
115 /* Object types */
116 #define REDIS_STRING 0
117 #define REDIS_LIST 1
118 #define REDIS_SET 2
119 #define REDIS_ZSET 3
120 #define REDIS_HASH 4
121
122 /* Objects encoding. Some kind of objects like Strings and Hashes can be
123 * internally represented in multiple ways. The 'encoding' field of the object
124 * is set to one of these values for this object. */
125 #define REDIS_ENCODING_RAW 0 /* Raw representation */
126 #define REDIS_ENCODING_INT 1 /* Encoded as integer */
127 #define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128 #define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
129
/* Printable names of the object encodings, listed in the same order as the
 * numeric values of the REDIS_ENCODING_* defines (0..3). */
static char *strencoding[] = {
    "raw",       /* REDIS_ENCODING_RAW    = 0 */
    "int",       /* REDIS_ENCODING_INT    = 1 */
    "zipmap",    /* REDIS_ENCODING_ZIPMAP = 2 */
    "hashtable"  /* REDIS_ENCODING_HT     = 3 */
};
133
134 /* Object types only used for dumping to disk */
135 #define REDIS_EXPIRETIME 253
136 #define REDIS_SELECTDB 254
137 #define REDIS_EOF 255
138
139 /* Defines related to the dump file format. To store 32 bits lengths for short
140 * keys requires a lot of space, so we check the most significant 2 bits of
141 * the first byte to interpret the length:
142 *
143 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
144 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
145 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
146 * 11|000000 this means: specially encoded object will follow. The six bits
147 * number specify the kind of object that follows.
148 * See the REDIS_RDB_ENC_* defines.
149 *
150 * Lengths up to 63 are stored using a single byte, most DB keys, and many
151 * values, will fit inside. */
152 #define REDIS_RDB_6BITLEN 0
153 #define REDIS_RDB_14BITLEN 1
154 #define REDIS_RDB_32BITLEN 2
155 #define REDIS_RDB_ENCVAL 3
156 #define REDIS_RDB_LENERR UINT_MAX
157
158 /* When a length of a string object stored on disk has the first two bits
159 * set, the remaining six bits specify a special encoding for the object
160 * accordingly to the following defines: */
161 #define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162 #define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163 #define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
164 #define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
165
166 /* Virtual memory object->where field. */
167 #define REDIS_VM_MEMORY 0 /* The object is on memory */
168 #define REDIS_VM_SWAPPED 1 /* The object is on disk */
169 #define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170 #define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
172 /* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about this magic numbers. */
174 #define REDIS_VM_MAX_NEAR_PAGES 65536
175 #define REDIS_VM_MAX_RANDOM_JUMP 4096
176 #define REDIS_VM_MAX_THREADS 32
177 #define REDIS_THREAD_STACK_SIZE (1024*1024*4)
178 /* The following is the *percentage* of completed I/O jobs to process when the
179 * handler is called. While Virtual Memory I/O operations are performed by
180 * threads, these operations must be processed by the main thread when completed
181 * in order to take effect. */
182 #define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
183
184 /* Client flags */
185 #define REDIS_SLAVE 1 /* This client is a slave server */
186 #define REDIS_MASTER 2 /* This client is a master server */
187 #define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188 #define REDIS_MULTI 8 /* This client is in a MULTI context */
189 #define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190 #define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
191
192 /* Slave replication state - slave side */
193 #define REDIS_REPL_NONE 0 /* No active replication */
194 #define REDIS_REPL_CONNECT 1 /* Must connect to master */
195 #define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
197 /* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201 #define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202 #define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203 #define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204 #define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
206 /* List related stuff */
207 #define REDIS_HEAD 0
208 #define REDIS_TAIL 1
209
210 /* Sort operations */
211 #define REDIS_SORT_GET 0
212 #define REDIS_SORT_ASC 1
213 #define REDIS_SORT_DESC 2
214 #define REDIS_SORTKEY_MAX 1024
215
216 /* Log levels */
217 #define REDIS_DEBUG 0
218 #define REDIS_VERBOSE 1
219 #define REDIS_NOTICE 2
220 #define REDIS_WARNING 3
221
222 /* Anti-warning macro... */
223 #define REDIS_NOTUSED(V) ((void) V)
224
225 #define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226 #define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
227
228 /* Append only defines */
229 #define APPENDFSYNC_NO 0
230 #define APPENDFSYNC_ALWAYS 1
231 #define APPENDFSYNC_EVERYSEC 2
232
233 /* Hashes related defaults */
234 #define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235 #define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* We can print the stacktrace, so our assert is defined this way: when the
 * asserted expression is false the macro calls _redisAssert() with the
 * expression text, the source file and the line number, then terminates the
 * process with _exit(1). When the expression is true it evaluates to a void
 * no-op. */
#define redisAssert(_e) \
    ((_e) ? (void)0 \
          : (_redisAssert(#_e, __FILE__, __LINE__), _exit(1)))
static void _redisAssert(char *estr, char *file, int line);
240
241 /*================================= Data types ============================== */
242
243 /* A redis object, that is a type able to hold a string / list / set */
244
/* The VM object structure: where an object lives on disk and when it was
 * last accessed. */
struct redisObjectVM {
    off_t page;      /* The page at which the object is stored on disk */
    off_t usedpages; /* Number of pages used on disk */
    time_t atime;    /* Last access time */
} vm; /* NOTE(review): this declarator also defines a global variable named
       * 'vm' of this type; it looks unintentional -- confirm nothing in the
       * rest of the file relies on it before removing it. */
251
252 /* The actual Redis Object */
253 typedef struct redisObject {
254 void *ptr;
255 unsigned char type;
256 unsigned char encoding;
257 unsigned char storage; /* If this object is a key, where is the value?
258 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
259 unsigned char vtype; /* If this object is a key, and value is swapped out,
260 * this is the type of the swapped out object. */
261 int refcount;
262 /* VM fields, this are only allocated if VM is active, otherwise the
263 * object allocation function will just allocate
264 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
265 * Redis without VM active will not have any overhead. */
266 struct redisObjectVM vm;
267 } robj;
268
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (an lvalue, not a pointer) and _ptr the value
 * stored in its 'ptr' field. The 'storage' field is written only when
 * server.vm_enabled is true; 'vtype' and 'vm' are never touched.
 *
 * The expansion deliberately has NO trailing semicolon: the caller supplies
 * it ("initStaticStringObject(o,p);"), so the macro behaves as one statement
 * and can be used as the body of an unbraced if/else. Arguments are
 * parenthesized so that arbitrary expressions can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
280
281 typedef struct redisDb {
282 dict *dict; /* The keyspace for this DB */
283 dict *expires; /* Timeout of keys with a timeout set */
284 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
285 dict *io_keys; /* Keys with clients waiting for VM I/O */
286 int id;
287 } redisDb;
288
289 /* Client MULTI/EXEC state */
290 typedef struct multiCmd {
291 robj **argv;
292 int argc;
293 struct redisCommand *cmd;
294 } multiCmd;
295
296 typedef struct multiState {
297 multiCmd *commands; /* Array of MULTI commands */
298 int count; /* Total number of MULTI commands */
299 } multiState;
300
301 /* With multiplexing we need to take per-clinet state.
302 * Clients are taken in a liked list. */
303 typedef struct redisClient {
304 int fd;
305 redisDb *db;
306 int dictid;
307 sds querybuf;
308 robj **argv, **mbargv;
309 int argc, mbargc;
310 int bulklen; /* bulk read len. -1 if not in bulk read mode */
311 int multibulk; /* multi bulk command format active */
312 list *reply;
313 int sentlen;
314 time_t lastinteraction; /* time of the last interaction, used for timeout */
315 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
316 int slaveseldb; /* slave selected db, if this client is a slave */
317 int authenticated; /* when requirepass is non-NULL */
318 int replstate; /* replication state if this is a slave */
319 int repldbfd; /* replication DB file descriptor */
320 long repldboff; /* replication DB file offset */
321 off_t repldbsize; /* replication DB file size */
322 multiState mstate; /* MULTI/EXEC state */
323 robj **blockingkeys; /* The key we are waiting to terminate a blocking
324 * operation such as BLPOP. Otherwise NULL. */
325 int blockingkeysnum; /* Number of blocking keys */
326 time_t blockingto; /* Blocking operation timeout. If UNIX current time
327 * is >= blockingto then the operation timed out. */
328 list *io_keys; /* Keys this client is waiting to be loaded from the
329 * swap file in order to continue. */
330 dict *pubsub_classes; /* Classes a client is interested in (SUBSCRIBE) */
331 } redisClient;
332
/* One save parameter: a (seconds, changes) pair. An array of these is kept
 * in redisServer.saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
337
338 /* Global server state structure */
339 struct redisServer {
340 int port;
341 int fd;
342 redisDb *db;
343 dict *sharingpool; /* Poll used for object sharing */
344 unsigned int sharingpoolsize;
345 long long dirty; /* changes to DB from the last save */
346 list *clients;
347 list *slaves, *monitors;
348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
357 long long stat_expiredkeys; /* number of expired keys */
358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
364 int appendonly;
365 int appendfsync;
366 time_t lastfsync;
367 int appendfd;
368 int appendseldb;
369 char *pidfile;
370 pid_t bgsavechildpid;
371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
378 char *appendfilename;
379 char *requirepass;
380 int shareobjects;
381 int rdbcompression;
382 /* Replication related */
383 int isslave;
384 char *masterauth;
385 char *masterhost;
386 int masterport;
387 redisClient *master; /* client that is master for this slave */
388 int replstate;
389 unsigned int maxclients;
390 unsigned long long maxmemory;
391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
398 /* Virtual memory configuration */
399 int vm_enabled;
400 char *vm_swap_file;
401 off_t vm_page_size;
402 off_t vm_pages;
403 unsigned long long vm_max_memory;
404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
413 time_t unixtime; /* Unix time sampled every second. */
414 /* Virtual memory I/O threads stuff */
415 /* An I/O thread process an element taken from the io_jobs queue and
416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
439 /* Pubsub */
440 dict *pubsub_classes; /* Associate classes to list of subscribed clients */
441 /* Misc */
442 FILE *devnull;
443 };
444
445 typedef void redisCommandProc(redisClient *c);
446 struct redisCommand {
447 char *name;
448 redisCommandProc *proc;
449 int arity;
450 int flags;
451 /* Use a function to determine which keys need to be loaded
452 * in the background prior to executing this command. Takes precedence
453 * over vm_firstkey and others, ignored when NULL */
454 redisCommandProc *vm_preload_proc;
455 /* What keys should be loaded in background when calling this command? */
456 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
457 int vm_lastkey; /* THe last argument that's a key */
458 int vm_keystep; /* The step between first and last key */
459 };
460
/* Pairs a function name with a pointer value stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
465
466 typedef struct _redisSortObject {
467 robj *obj;
468 union {
469 double score;
470 robj *cmpobj;
471 } u;
472 } redisSortObject;
473
474 typedef struct _redisSortOperation {
475 int type;
476 robj *pattern;
477 } redisSortOperation;
478
479 /* ZSETs use a specialized version of Skiplists */
480
481 typedef struct zskiplistNode {
482 struct zskiplistNode **forward;
483 struct zskiplistNode *backward;
484 unsigned int *span;
485 double score;
486 robj *obj;
487 } zskiplistNode;
488
489 typedef struct zskiplist {
490 struct zskiplistNode *header, *tail;
491 unsigned long length;
492 int level;
493 } zskiplist;
494
495 typedef struct zset {
496 dict *dict;
497 zskiplist *zsl;
498 } zset;
499
500 /* Our shared "common" objects */
501
502 struct sharedObjectsStruct {
503 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
504 *colon, *nullbulk, *nullmultibulk, *queued,
505 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
506 *outofrangeerr, *plus,
507 *select0, *select1, *select2, *select3, *select4,
508 *select5, *select6, *select7, *select8, *select9,
509 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3;
510 } shared;
511
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
517
518 /* VM threaded I/O request message */
519 #define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
520 #define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
521 #define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
522 typedef struct iojob {
523 int type; /* Request type, REDIS_IOJOB_* */
524 redisDb *db;/* Redis database */
525 robj *key; /* This I/O request is about swapping this key */
526 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
527 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
528 off_t page; /* Swap page where to read/write the object */
529 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
530 int canceled; /* True if this command was canceled by blocking side of VM */
531 pthread_t thread; /* ID of the thread processing this entry */
532 } iojob;
533
534 /*================================ Prototypes =============================== */
535
536 static void freeStringObject(robj *o);
537 static void freeListObject(robj *o);
538 static void freeSetObject(robj *o);
539 static void decrRefCount(void *o);
540 static robj *createObject(int type, void *ptr);
541 static void freeClient(redisClient *c);
542 static int rdbLoad(char *filename);
543 static void addReply(redisClient *c, robj *obj);
544 static void addReplySds(redisClient *c, sds s);
545 static void incrRefCount(robj *o);
546 static int rdbSaveBackground(char *filename);
547 static robj *createStringObject(char *ptr, size_t len);
548 static robj *dupStringObject(robj *o);
549 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
550 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
551 static int syncWithMaster(void);
552 static robj *tryObjectSharing(robj *o);
553 static int tryObjectEncoding(robj *o);
554 static robj *getDecodedObject(robj *o);
555 static int removeExpire(redisDb *db, robj *key);
556 static int expireIfNeeded(redisDb *db, robj *key);
557 static int deleteIfVolatile(redisDb *db, robj *key);
558 static int deleteIfSwapped(redisDb *db, robj *key);
559 static int deleteKey(redisDb *db, robj *key);
560 static time_t getExpire(redisDb *db, robj *key);
561 static int setExpire(redisDb *db, robj *key, time_t when);
562 static void updateSlavesWaitingBgsave(int bgsaveerr);
563 static void freeMemoryIfNeeded(void);
564 static int processCommand(redisClient *c);
565 static void setupSigSegvAction(void);
566 static void rdbRemoveTempFile(pid_t childpid);
567 static void aofRemoveTempFile(pid_t childpid);
568 static size_t stringObjectLen(robj *o);
569 static void processInputBuffer(redisClient *c);
570 static zskiplist *zslCreate(void);
571 static void zslFree(zskiplist *zsl);
572 static void zslInsert(zskiplist *zsl, double score, robj *obj);
573 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
574 static void initClientMultiState(redisClient *c);
575 static void freeClientMultiState(redisClient *c);
576 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
577 static void unblockClientWaitingData(redisClient *c);
578 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
579 static void vmInit(void);
580 static void vmMarkPagesFree(off_t page, off_t count);
581 static robj *vmLoadObject(robj *key);
582 static robj *vmPreviewObject(robj *key);
583 static int vmSwapOneObjectBlocking(void);
584 static int vmSwapOneObjectThreaded(void);
585 static int vmCanSwapOut(void);
586 static int tryFreeOneObjectFromFreelist(void);
587 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
588 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
589 static void vmCancelThreadedIOJob(robj *o);
590 static void lockThreadedIO(void);
591 static void unlockThreadedIO(void);
592 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
593 static void freeIOJob(iojob *j);
594 static void queueIOJob(iojob *j);
595 static int vmWriteObjectOnSwap(robj *o, off_t page);
596 static robj *vmReadObjectFromSwap(off_t page, int type);
597 static void waitEmptyIOJobsQueue(void);
598 static void vmReopenSwapFile(void);
599 static int vmFreePage(off_t page);
600 static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
601 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
602 static int dontWaitForSwappedKey(redisClient *c, robj *key);
603 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
604 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
605 static struct redisCommand *lookupCommand(char *name);
606 static void call(redisClient *c, struct redisCommand *cmd);
607 static void resetClient(redisClient *c);
608 static void convertToRealHash(robj *o);
609 static int pubsubUnsubscribeAll(redisClient *c, int notify);
610 static void usage();
611
612 static void authCommand(redisClient *c);
613 static void pingCommand(redisClient *c);
614 static void echoCommand(redisClient *c);
615 static void setCommand(redisClient *c);
616 static void setnxCommand(redisClient *c);
617 static void getCommand(redisClient *c);
618 static void delCommand(redisClient *c);
619 static void existsCommand(redisClient *c);
620 static void incrCommand(redisClient *c);
621 static void decrCommand(redisClient *c);
622 static void incrbyCommand(redisClient *c);
623 static void decrbyCommand(redisClient *c);
624 static void selectCommand(redisClient *c);
625 static void randomkeyCommand(redisClient *c);
626 static void keysCommand(redisClient *c);
627 static void dbsizeCommand(redisClient *c);
628 static void lastsaveCommand(redisClient *c);
629 static void saveCommand(redisClient *c);
630 static void bgsaveCommand(redisClient *c);
631 static void bgrewriteaofCommand(redisClient *c);
632 static void shutdownCommand(redisClient *c);
633 static void moveCommand(redisClient *c);
634 static void renameCommand(redisClient *c);
635 static void renamenxCommand(redisClient *c);
636 static void lpushCommand(redisClient *c);
637 static void rpushCommand(redisClient *c);
638 static void lpopCommand(redisClient *c);
639 static void rpopCommand(redisClient *c);
640 static void llenCommand(redisClient *c);
641 static void lindexCommand(redisClient *c);
642 static void lrangeCommand(redisClient *c);
643 static void ltrimCommand(redisClient *c);
644 static void typeCommand(redisClient *c);
645 static void lsetCommand(redisClient *c);
646 static void saddCommand(redisClient *c);
647 static void sremCommand(redisClient *c);
648 static void smoveCommand(redisClient *c);
649 static void sismemberCommand(redisClient *c);
650 static void scardCommand(redisClient *c);
651 static void spopCommand(redisClient *c);
652 static void srandmemberCommand(redisClient *c);
653 static void sinterCommand(redisClient *c);
654 static void sinterstoreCommand(redisClient *c);
655 static void sunionCommand(redisClient *c);
656 static void sunionstoreCommand(redisClient *c);
657 static void sdiffCommand(redisClient *c);
658 static void sdiffstoreCommand(redisClient *c);
659 static void syncCommand(redisClient *c);
660 static void flushdbCommand(redisClient *c);
661 static void flushallCommand(redisClient *c);
662 static void sortCommand(redisClient *c);
663 static void lremCommand(redisClient *c);
664 static void rpoplpushcommand(redisClient *c);
665 static void infoCommand(redisClient *c);
666 static void mgetCommand(redisClient *c);
667 static void monitorCommand(redisClient *c);
668 static void expireCommand(redisClient *c);
669 static void expireatCommand(redisClient *c);
670 static void getsetCommand(redisClient *c);
671 static void ttlCommand(redisClient *c);
672 static void slaveofCommand(redisClient *c);
673 static void debugCommand(redisClient *c);
674 static void msetCommand(redisClient *c);
675 static void msetnxCommand(redisClient *c);
676 static void zaddCommand(redisClient *c);
677 static void zincrbyCommand(redisClient *c);
678 static void zrangeCommand(redisClient *c);
679 static void zrangebyscoreCommand(redisClient *c);
680 static void zcountCommand(redisClient *c);
681 static void zrevrangeCommand(redisClient *c);
682 static void zcardCommand(redisClient *c);
683 static void zremCommand(redisClient *c);
684 static void zscoreCommand(redisClient *c);
685 static void zremrangebyscoreCommand(redisClient *c);
686 static void multiCommand(redisClient *c);
687 static void execCommand(redisClient *c);
688 static void discardCommand(redisClient *c);
689 static void blpopCommand(redisClient *c);
690 static void brpopCommand(redisClient *c);
691 static void appendCommand(redisClient *c);
692 static void substrCommand(redisClient *c);
693 static void zrankCommand(redisClient *c);
694 static void zrevrankCommand(redisClient *c);
695 static void hsetCommand(redisClient *c);
696 static void hgetCommand(redisClient *c);
697 static void hdelCommand(redisClient *c);
698 static void hlenCommand(redisClient *c);
699 static void zremrangebyrankCommand(redisClient *c);
700 static void zunionCommand(redisClient *c);
701 static void zinterCommand(redisClient *c);
702 static void hkeysCommand(redisClient *c);
703 static void hvalsCommand(redisClient *c);
704 static void hgetallCommand(redisClient *c);
705 static void hexistsCommand(redisClient *c);
706 static void configCommand(redisClient *c);
707 static void hincrbyCommand(redisClient *c);
708 static void subscribeCommand(redisClient *c);
709 static void unsubscribeCommand(redisClient *c);
710 static void publishCommand(redisClient *c);
711
712 /*================================= Globals ================================= */
713
714 /* Global vars */
715 static struct redisServer server; /* server global state */
716 static struct redisCommand cmdTable[] = {
717 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
718 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
719 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
720 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
721 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
722 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
723 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
724 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
725 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
726 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
727 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
728 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
729 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
730 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
731 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
732 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
739 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
740 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
742 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
743 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
744 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
748 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
749 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
750 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
751 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
752 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
753 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
760 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
761 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
768 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
773 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
779 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
782 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
783 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
784 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
785 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
790 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
791 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
792 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
793 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
795 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
799 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
800 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"exec",execCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
814 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"publish",publishCommand,3,REDIS_CMD_BULK,NULL,0,0,0},
817 {NULL,NULL,0,0,NULL,0,0,0}
818 };
819
820 /*============================ Utility functions ============================ */
821
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Neither buffer needs to be NUL
 * terminated: no byte past the given lengths is ever read.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [...]   exactly one byte out of a class; a leading ^ negates the class,
 *           x-y is a range (the bounds may be given in any order) and a
 *           backslash takes the next byte literally
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non zero bytes are compared after tolower().
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, without peeking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte: with nothing left to
             * consume it can't match, not even when negated. Without this
             * check stringLen would become negative. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back by one so that the
                     * increment after the switch leaves patternLen at 0. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    /* Plain class member. A backslash that is the very last
                     * byte of the pattern ends here as well. */
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one byte of input to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: trailing stars match the empty string. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
944
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are obtained with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
948
949 static void redisLog(int level, const char *fmt, ...) {
950 va_list ap;
951 FILE *fp;
952
953 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
954 if (!fp) return;
955
956 va_start(ap, fmt);
957 if (level >= server.verbosity) {
958 char *c = ".-*#";
959 char buf[64];
960 time_t now;
961
962 now = time(NULL);
963 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
964 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
965 vfprintf(fp, fmt, ap);
966 fprintf(fp,"\n");
967 fflush(fp);
968 }
969 va_end(ap);
970
971 if (server.logfile) fclose(fp);
972 }
973
974 /*====================== Hash table type implementation ==================== */
975
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
979
/* Value destructor that simply hands the value to zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
985
986 static void dictListDestructor(void *privdata, void *val)
987 {
988 DICT_NOTUSED(privdata);
989 listRelease((list*)val);
990 }
991
992 static int sdsDictKeyCompare(void *privdata, const void *key1,
993 const void *key2)
994 {
995 int l1,l2;
996 DICT_NOTUSED(privdata);
997
998 l1 = sdslen((sds)key1);
999 l2 = sdslen((sds)key2);
1000 if (l1 != l2) return 0;
1001 return memcmp(key1, key2, l1) == 0;
1002 }
1003
/* Destructor for keys/values that are redis objects: drops one reference
 * with decrRefCount(). NULL is accepted and ignored, since the values of
 * swapped out keys are set to NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1011
1012 static int dictObjKeyCompare(void *privdata, const void *key1,
1013 const void *key2)
1014 {
1015 const robj *o1 = key1, *o2 = key2;
1016 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1017 }
1018
1019 static unsigned int dictObjHash(const void *key) {
1020 const robj *o = key;
1021 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1022 }
1023
1024 static int dictEncObjKeyCompare(void *privdata, const void *key1,
1025 const void *key2)
1026 {
1027 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1028 int cmp;
1029
1030 if (o1->encoding == REDIS_ENCODING_INT &&
1031 o2->encoding == REDIS_ENCODING_INT &&
1032 o1->ptr == o2->ptr) return 1;
1033
1034 o1 = getDecodedObject(o1);
1035 o2 = getDecodedObject(o2);
1036 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1037 decrRefCount(o1);
1038 decrRefCount(o2);
1039 return cmp;
1040 }
1041
1042 static unsigned int dictEncObjHash(const void *key) {
1043 robj *o = (robj*) key;
1044
1045 if (o->encoding == REDIS_ENCODING_RAW) {
1046 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1047 } else {
1048 if (o->encoding == REDIS_ENCODING_INT) {
1049 char buf[32];
1050 int len;
1051
1052 len = snprintf(buf,32,"%ld",(long)o->ptr);
1053 return dictGenHashFunction((unsigned char*)buf, len);
1054 } else {
1055 unsigned int hash;
1056
1057 o = getDecodedObject(o);
1058 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1059 decrRefCount(o);
1060 return hash;
1061 }
1062 }
1063 }
1064
1065 /* Sets type and expires */
1066 static dictType setDictType = {
1067 dictEncObjHash, /* hash function */
1068 NULL, /* key dup */
1069 NULL, /* val dup */
1070 dictEncObjKeyCompare, /* key compare */
1071 dictRedisObjectDestructor, /* key destructor */
1072 NULL /* val destructor */
1073 };
1074
1075 /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1076 static dictType zsetDictType = {
1077 dictEncObjHash, /* hash function */
1078 NULL, /* key dup */
1079 NULL, /* val dup */
1080 dictEncObjKeyCompare, /* key compare */
1081 dictRedisObjectDestructor, /* key destructor */
1082 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1083 };
1084
1085 /* Db->dict */
1086 static dictType dbDictType = {
1087 dictObjHash, /* hash function */
1088 NULL, /* key dup */
1089 NULL, /* val dup */
1090 dictObjKeyCompare, /* key compare */
1091 dictRedisObjectDestructor, /* key destructor */
1092 dictRedisObjectDestructor /* val destructor */
1093 };
1094
1095 /* Db->expires */
1096 static dictType keyptrDictType = {
1097 dictObjHash, /* hash function */
1098 NULL, /* key dup */
1099 NULL, /* val dup */
1100 dictObjKeyCompare, /* key compare */
1101 dictRedisObjectDestructor, /* key destructor */
1102 NULL /* val destructor */
1103 };
1104
1105 /* Hash type hash table (note that small hashes are represented with zimpaps) */
1106 static dictType hashDictType = {
1107 dictEncObjHash, /* hash function */
1108 NULL, /* key dup */
1109 NULL, /* val dup */
1110 dictEncObjKeyCompare, /* key compare */
1111 dictRedisObjectDestructor, /* key destructor */
1112 dictRedisObjectDestructor /* val destructor */
1113 };
1114
1115 /* Keylist hash table type has unencoded redis objects as keys and
1116 * lists as values. It's used for blocking operations (BLPOP) and to
1117 * map swapped keys to a list of clients waiting for this keys to be loaded. */
1118 static dictType keylistDictType = {
1119 dictObjHash, /* hash function */
1120 NULL, /* key dup */
1121 NULL, /* val dup */
1122 dictObjKeyCompare, /* key compare */
1123 dictRedisObjectDestructor, /* key destructor */
1124 dictListDestructor /* val destructor */
1125 };
1126
1127 static void version();
1128
1129 /* ========================= Random utility functions ======================= */
1130
1131 /* Redis generally does not try to recover from out of memory conditions
1132 * when allocating objects or strings, it is not clear if it will be possible
1133 * to report this condition to the client since the networking layer itself
1134 * is based on heap allocation for send buffers, so we simply abort.
1135 * At least the code will be simpler to read... */
1136 static void oom(const char *msg) {
1137 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
1138 sleep(1);
1139 abort();
1140 }
1141
1142 /* ====================== Redis server networking stuff ===================== */
1143 static void closeTimedoutClients(void) {
1144 redisClient *c;
1145 listNode *ln;
1146 time_t now = time(NULL);
1147 listIter li;
1148
1149 listRewind(server.clients,&li);
1150 while ((ln = listNext(&li)) != NULL) {
1151 c = listNodeValue(ln);
1152 if (server.maxidletime &&
1153 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
1154 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
1155 (now - c->lastinteraction > server.maxidletime))
1156 {
1157 redisLog(REDIS_VERBOSE,"Closing idle client");
1158 freeClient(c);
1159 } else if (c->flags & REDIS_BLOCKED) {
1160 if (c->blockingto != 0 && c->blockingto < now) {
1161 addReply(c,shared.nullmultibulk);
1162 unblockClientWaitingData(c);
1163 }
1164 }
1165 }
1166 }
1167
1168 static int htNeedsResize(dict *dict) {
1169 long long size, used;
1170
1171 size = dictSlots(dict);
1172 used = dictSize(dict);
1173 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1174 (used*100/size < REDIS_HT_MINFILL));
1175 }
1176
1177 /* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1178 * we resize the hash table to save memory */
1179 static void tryResizeHashTables(void) {
1180 int j;
1181
1182 for (j = 0; j < server.dbnum; j++) {
1183 if (htNeedsResize(server.db[j].dict)) {
1184 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1185 dictResize(server.db[j].dict);
1186 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1187 }
1188 if (htNeedsResize(server.db[j].expires))
1189 dictResize(server.db[j].expires);
1190 }
1191 }
1192
1193 /* A background saving child (BGSAVE) terminated its work. Handle this. */
1194 void backgroundSaveDoneHandler(int statloc) {
1195 int exitcode = WEXITSTATUS(statloc);
1196 int bysignal = WIFSIGNALED(statloc);
1197
1198 if (!bysignal && exitcode == 0) {
1199 redisLog(REDIS_NOTICE,
1200 "Background saving terminated with success");
1201 server.dirty = 0;
1202 server.lastsave = time(NULL);
1203 } else if (!bysignal && exitcode != 0) {
1204 redisLog(REDIS_WARNING, "Background saving error");
1205 } else {
1206 redisLog(REDIS_WARNING,
1207 "Background saving terminated by signal");
1208 rdbRemoveTempFile(server.bgsavechildpid);
1209 }
1210 server.bgsavechildpid = -1;
1211 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1212 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1213 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1214 }
1215
1216 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1217 * Handle this. */
1218 void backgroundRewriteDoneHandler(int statloc) {
1219 int exitcode = WEXITSTATUS(statloc);
1220 int bysignal = WIFSIGNALED(statloc);
1221
1222 if (!bysignal && exitcode == 0) {
1223 int fd;
1224 char tmpfile[256];
1225
1226 redisLog(REDIS_NOTICE,
1227 "Background append only file rewriting terminated with success");
1228 /* Now it's time to flush the differences accumulated by the parent */
1229 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1230 fd = open(tmpfile,O_WRONLY|O_APPEND);
1231 if (fd == -1) {
1232 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1233 goto cleanup;
1234 }
1235 /* Flush our data... */
1236 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1237 (signed) sdslen(server.bgrewritebuf)) {
1238 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1239 close(fd);
1240 goto cleanup;
1241 }
1242 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
1243 /* Now our work is to rename the temp file into the stable file. And
1244 * switch the file descriptor used by the server for append only. */
1245 if (rename(tmpfile,server.appendfilename) == -1) {
1246 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1247 close(fd);
1248 goto cleanup;
1249 }
1250 /* Mission completed... almost */
1251 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1252 if (server.appendfd != -1) {
1253 /* If append only is actually enabled... */
1254 close(server.appendfd);
1255 server.appendfd = fd;
1256 fsync(fd);
1257 server.appendseldb = -1; /* Make sure it will issue SELECT */
1258 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1259 } else {
1260 /* If append only is disabled we just generate a dump in this
1261 * format. Why not? */
1262 close(fd);
1263 }
1264 } else if (!bysignal && exitcode != 0) {
1265 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1266 } else {
1267 redisLog(REDIS_WARNING,
1268 "Background append only file rewriting terminated by signal");
1269 }
1270 cleanup:
1271 sdsfree(server.bgrewritebuf);
1272 server.bgrewritebuf = sdsempty();
1273 aofRemoveTempFile(server.bgrewritechildpid);
1274 server.bgrewritechildpid = -1;
1275 }
1276
1277 static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1278 int j, loops = server.cronloops++;
1279 REDIS_NOTUSED(eventLoop);
1280 REDIS_NOTUSED(id);
1281 REDIS_NOTUSED(clientData);
1282
1283 /* We take a cached value of the unix time in the global state because
1284 * with virtual memory and aging there is to store the current time
1285 * in objects at every object access, and accuracy is not needed.
1286 * To access a global var is faster than calling time(NULL) */
1287 server.unixtime = time(NULL);
1288
1289 /* Show some info about non-empty databases */
1290 for (j = 0; j < server.dbnum; j++) {
1291 long long size, used, vkeys;
1292
1293 size = dictSlots(server.db[j].dict);
1294 used = dictSize(server.db[j].dict);
1295 vkeys = dictSize(server.db[j].expires);
1296 if (!(loops % 50) && (used || vkeys)) {
1297 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
1298 /* dictPrintStats(server.dict); */
1299 }
1300 }
1301
1302 /* We don't want to resize the hash tables while a bacground saving
1303 * is in progress: the saving child is created using fork() that is
1304 * implemented with a copy-on-write semantic in most modern systems, so
1305 * if we resize the HT while there is the saving child at work actually
1306 * a lot of memory movements in the parent will cause a lot of pages
1307 * copied. */
1308 if (server.bgsavechildpid == -1 && !(loops % 10)) tryResizeHashTables();
1309
1310 /* Show information about connected clients */
1311 if (!(loops % 50)) {
1312 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
1313 listLength(server.clients)-listLength(server.slaves),
1314 listLength(server.slaves),
1315 zmalloc_used_memory(),
1316 dictSize(server.sharingpool));
1317 }
1318
1319 /* Close connections of timedout clients */
1320 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
1321 closeTimedoutClients();
1322
1323 /* Check if a background saving or AOF rewrite in progress terminated */
1324 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
1325 int statloc;
1326 pid_t pid;
1327
1328 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1329 if (pid == server.bgsavechildpid) {
1330 backgroundSaveDoneHandler(statloc);
1331 } else {
1332 backgroundRewriteDoneHandler(statloc);
1333 }
1334 }
1335 } else {
1336 /* If there is not a background saving in progress check if
1337 * we have to save now */
1338 time_t now = time(NULL);
1339 for (j = 0; j < server.saveparamslen; j++) {
1340 struct saveparam *sp = server.saveparams+j;
1341
1342 if (server.dirty >= sp->changes &&
1343 now-server.lastsave > sp->seconds) {
1344 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1345 sp->changes, sp->seconds);
1346 rdbSaveBackground(server.dbfilename);
1347 break;
1348 }
1349 }
1350 }
1351
1352 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1353 * will use few CPU cycles if there are few expiring keys, otherwise
1354 * it will get more aggressive to avoid that too much memory is used by
1355 * keys that can be removed from the keyspace. */
1356 for (j = 0; j < server.dbnum; j++) {
1357 int expired;
1358 redisDb *db = server.db+j;
1359
1360 /* Continue to expire if at the end of the cycle more than 25%
1361 * of the keys were expired. */
1362 do {
1363 long num = dictSize(db->expires);
1364 time_t now = time(NULL);
1365
1366 expired = 0;
1367 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1368 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1369 while (num--) {
1370 dictEntry *de;
1371 time_t t;
1372
1373 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1374 t = (time_t) dictGetEntryVal(de);
1375 if (now > t) {
1376 deleteKey(db,dictGetEntryKey(de));
1377 expired++;
1378 server.stat_expiredkeys++;
1379 }
1380 }
1381 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1382 }
1383
1384 /* Swap a few keys on disk if we are over the memory limit and VM
1385 * is enbled. Try to free objects from the free list first. */
1386 if (vmCanSwapOut()) {
1387 while (server.vm_enabled && zmalloc_used_memory() >
1388 server.vm_max_memory)
1389 {
1390 int retval;
1391
1392 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
1393 retval = (server.vm_max_threads == 0) ?
1394 vmSwapOneObjectBlocking() :
1395 vmSwapOneObjectThreaded();
1396 if (retval == REDIS_ERR && !(loops % 300) &&
1397 zmalloc_used_memory() >
1398 (server.vm_max_memory+server.vm_max_memory/10))
1399 {
1400 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
1401 }
1402 /* Note that when using threade I/O we free just one object,
1403 * because anyway when the I/O thread in charge to swap this
1404 * object out will finish, the handler of completed jobs
1405 * will try to swap more objects if we are still out of memory. */
1406 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
1407 }
1408 }
1409
1410 /* Check if we should connect to a MASTER */
1411 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
1412 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1413 if (syncWithMaster() == REDIS_OK) {
1414 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1415 }
1416 }
1417 return 100;
1418 }
1419
1420 /* This function gets called every time Redis is entering the
1421 * main loop of the event driven library, that is, before to sleep
1422 * for ready file descriptors. */
1423 static void beforeSleep(struct aeEventLoop *eventLoop) {
1424 REDIS_NOTUSED(eventLoop);
1425
1426 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1427 listIter li;
1428 listNode *ln;
1429
1430 listRewind(server.io_ready_clients,&li);
1431 while((ln = listNext(&li))) {
1432 redisClient *c = ln->value;
1433 struct redisCommand *cmd;
1434
1435 /* Resume the client. */
1436 listDelNode(server.io_ready_clients,ln);
1437 c->flags &= (~REDIS_IO_WAIT);
1438 server.vm_blocked_clients--;
1439 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1440 readQueryFromClient, c);
1441 cmd = lookupCommand(c->argv[0]->ptr);
1442 assert(cmd != NULL);
1443 call(c,cmd);
1444 resetClient(c);
1445 /* There may be more data to process in the input buffer. */
1446 if (c->querybuf && sdslen(c->querybuf) > 0)
1447 processInputBuffer(c);
1448 }
1449 }
1450 }
1451
1452 static void createSharedObjects(void) {
1453 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1454 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1455 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1456 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1457 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1458 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1459 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1460 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1461 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
1462 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
1463 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
1464 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1465 "-ERR Operation against a key holding the wrong kind of value\r\n"));
1466 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1467 "-ERR no such key\r\n"));
1468 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1469 "-ERR syntax error\r\n"));
1470 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1471 "-ERR source and destination objects are the same\r\n"));
1472 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1473 "-ERR index out of range\r\n"));
1474 shared.space = createObject(REDIS_STRING,sdsnew(" "));
1475 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1476 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
1477 shared.select0 = createStringObject("select 0\r\n",10);
1478 shared.select1 = createStringObject("select 1\r\n",10);
1479 shared.select2 = createStringObject("select 2\r\n",10);
1480 shared.select3 = createStringObject("select 3\r\n",10);
1481 shared.select4 = createStringObject("select 4\r\n",10);
1482 shared.select5 = createStringObject("select 5\r\n",10);
1483 shared.select6 = createStringObject("select 6\r\n",10);
1484 shared.select7 = createStringObject("select 7\r\n",10);
1485 shared.select8 = createStringObject("select 8\r\n",10);
1486 shared.select9 = createStringObject("select 9\r\n",10);
1487 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1488 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
1489 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",17);
1490 shared.mbulk3 = createStringObject("*3\r\n",4);
1491 }
1492
1493 static void appendServerSaveParams(time_t seconds, int changes) {
1494 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
1495 server.saveparams[server.saveparamslen].seconds = seconds;
1496 server.saveparams[server.saveparamslen].changes = changes;
1497 server.saveparamslen++;
1498 }
1499
1500 static void resetServerSaveParams() {
1501 zfree(server.saveparams);
1502 server.saveparams = NULL;
1503 server.saveparamslen = 0;
1504 }
1505
1506 static void initServerConfig() {
1507 server.dbnum = REDIS_DEFAULT_DBNUM;
1508 server.port = REDIS_SERVERPORT;
1509 server.verbosity = REDIS_VERBOSE;
1510 server.maxidletime = REDIS_MAXIDLETIME;
1511 server.saveparams = NULL;
1512 server.logfile = NULL; /* NULL = log on standard output */
1513 server.bindaddr = NULL;
1514 server.glueoutputbuf = 1;
1515 server.daemonize = 0;
1516 server.appendonly = 0;
1517 server.appendfsync = APPENDFSYNC_ALWAYS;
1518 server.lastfsync = time(NULL);
1519 server.appendfd = -1;
1520 server.appendseldb = -1; /* Make sure the first time will not match */
1521 server.pidfile = zstrdup("/var/run/redis.pid");
1522 server.dbfilename = zstrdup("dump.rdb");
1523 server.appendfilename = zstrdup("appendonly.aof");
1524 server.requirepass = NULL;
1525 server.shareobjects = 0;
1526 server.rdbcompression = 1;
1527 server.sharingpoolsize = 1024;
1528 server.maxclients = 0;
1529 server.blpop_blocked_clients = 0;
1530 server.maxmemory = 0;
1531 server.vm_enabled = 0;
1532 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1533 server.vm_page_size = 256; /* 256 bytes per page */
1534 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1535 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1536 server.vm_max_threads = 4;
1537 server.vm_blocked_clients = 0;
1538 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1539 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1540
1541 resetServerSaveParams();
1542
1543 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1544 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1545 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1546 /* Replication related */
1547 server.isslave = 0;
1548 server.masterauth = NULL;
1549 server.masterhost = NULL;
1550 server.masterport = 6379;
1551 server.master = NULL;
1552 server.replstate = REDIS_REPL_NONE;
1553
1554 /* Double constants initialization */
1555 R_Zero = 0.0;
1556 R_PosInf = 1.0/R_Zero;
1557 R_NegInf = -1.0/R_Zero;
1558 R_Nan = R_Zero/R_Zero;
1559 }
1560
1561 static void initServer() {
1562 int j;
1563
1564 signal(SIGHUP, SIG_IGN);
1565 signal(SIGPIPE, SIG_IGN);
1566 setupSigSegvAction();
1567
1568 server.devnull = fopen("/dev/null","w");
1569 if (server.devnull == NULL) {
1570 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1571 exit(1);
1572 }
1573 server.clients = listCreate();
1574 server.slaves = listCreate();
1575 server.monitors = listCreate();
1576 server.objfreelist = listCreate();
1577 createSharedObjects();
1578 server.el = aeCreateEventLoop();
1579 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
1580 server.sharingpool = dictCreate(&setDictType,NULL);
1581 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1582 if (server.fd == -1) {
1583 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1584 exit(1);
1585 }
1586 for (j = 0; j < server.dbnum; j++) {
1587 server.db[j].dict = dictCreate(&dbDictType,NULL);
1588 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
1589 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
1590 if (server.vm_enabled)
1591 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
1592 server.db[j].id = j;
1593 }
1594 server.pubsub_classes = dictCreate(&keylistDictType,NULL);
1595 server.cronloops = 0;
1596 server.bgsavechildpid = -1;
1597 server.bgrewritechildpid = -1;
1598 server.bgrewritebuf = sdsempty();
1599 server.lastsave = time(NULL);
1600 server.dirty = 0;
1601 server.stat_numcommands = 0;
1602 server.stat_numconnections = 0;
1603 server.stat_expiredkeys = 0;
1604 server.stat_starttime = time(NULL);
1605 server.unixtime = time(NULL);
1606 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1607 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1608 acceptHandler, NULL) == AE_ERR) oom("creating file event");
1609
1610 if (server.appendonly) {
1611 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1612 if (server.appendfd == -1) {
1613 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1614 strerror(errno));
1615 exit(1);
1616 }
1617 }
1618
1619 if (server.vm_enabled) vmInit();
1620 }
1621
1622 /* Empty the whole database */
1623 static long long emptyDb() {
1624 int j;
1625 long long removed = 0;
1626
1627 for (j = 0; j < server.dbnum; j++) {
1628 removed += dictSize(server.db[j].dict);
1629 dictEmpty(server.db[j].dict);
1630 dictEmpty(server.db[j].expires);
1631 }
1632 return removed;
1633 }
1634
/* Translate a boolean configuration word into an integer.
 * Returns 1 for "yes", 0 for "no" (both matched ignoring case) and -1 for
 * any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1640
1641 /* I agree, this is a very rudimental way to load a configuration...
1642 will improve later if the config gets more complex */
1643 static void loadServerConfig(char *filename) {
1644 FILE *fp;
1645 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1646 int linenum = 0;
1647 sds line = NULL;
1648 char *errormsg = "Fatal error, can't open config file '%s'";
1649 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1650 sprintf(errorbuf, errormsg, filename);
1651
1652 if (filename[0] == '-' && filename[1] == '\0')
1653 fp = stdin;
1654 else {
1655 if ((fp = fopen(filename,"r")) == NULL) {
1656 redisLog(REDIS_WARNING, errorbuf);
1657 exit(1);
1658 }
1659 }
1660
1661 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1662 sds *argv;
1663 int argc, j;
1664
1665 linenum++;
1666 line = sdsnew(buf);
1667 line = sdstrim(line," \t\r\n");
1668
1669 /* Skip comments and blank lines*/
1670 if (line[0] == '#' || line[0] == '\0') {
1671 sdsfree(line);
1672 continue;
1673 }
1674
1675 /* Split into arguments */
1676 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1677 sdstolower(argv[0]);
1678
1679 /* Execute config directives */
1680 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
1681 server.maxidletime = atoi(argv[1]);
1682 if (server.maxidletime < 0) {
1683 err = "Invalid timeout value"; goto loaderr;
1684 }
1685 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
1686 server.port = atoi(argv[1]);
1687 if (server.port < 1 || server.port > 65535) {
1688 err = "Invalid port"; goto loaderr;
1689 }
1690 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
1691 server.bindaddr = zstrdup(argv[1]);
1692 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
1693 int seconds = atoi(argv[1]);
1694 int changes = atoi(argv[2]);
1695 if (seconds < 1 || changes < 0) {
1696 err = "Invalid save parameters"; goto loaderr;
1697 }
1698 appendServerSaveParams(seconds,changes);
1699 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
1700 if (chdir(argv[1]) == -1) {
1701 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1702 argv[1], strerror(errno));
1703 exit(1);
1704 }
1705 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1706 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1707 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1708 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1709 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
1710 else {
1711 err = "Invalid log level. Must be one of debug, notice, warning";
1712 goto loaderr;
1713 }
1714 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1715 FILE *logfp;
1716
1717 server.logfile = zstrdup(argv[1]);
1718 if (!strcasecmp(server.logfile,"stdout")) {
1719 zfree(server.logfile);
1720 server.logfile = NULL;
1721 }
1722 if (server.logfile) {
1723 /* Test if we are able to open the file. The server will not
1724 * be able to abort just for this problem later... */
1725 logfp = fopen(server.logfile,"a");
1726 if (logfp == NULL) {
1727 err = sdscatprintf(sdsempty(),
1728 "Can't open the log file: %s", strerror(errno));
1729 goto loaderr;
1730 }
1731 fclose(logfp);
1732 }
1733 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
1734 server.dbnum = atoi(argv[1]);
1735 if (server.dbnum < 1) {
1736 err = "Invalid number of databases"; goto loaderr;
1737 }
1738 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1739 loadServerConfig(argv[1]);
1740 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1741 server.maxclients = atoi(argv[1]);
1742 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1743 server.maxmemory = strtoll(argv[1], NULL, 10);
1744 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
1745 server.masterhost = sdsnew(argv[1]);
1746 server.masterport = atoi(argv[2]);
1747 server.replstate = REDIS_REPL_CONNECT;
1748 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1749 server.masterauth = zstrdup(argv[1]);
1750 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
1751 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
1752 err = "argument must be 'yes' or 'no'"; goto loaderr;
1753 }
1754 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
1755 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1756 err = "argument must be 'yes' or 'no'"; goto loaderr;
1757 }
1758 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1759 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1760 err = "argument must be 'yes' or 'no'"; goto loaderr;
1761 }
1762 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1763 server.sharingpoolsize = atoi(argv[1]);
1764 if (server.sharingpoolsize < 1) {
1765 err = "invalid object sharing pool size"; goto loaderr;
1766 }
1767 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
1768 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
1769 err = "argument must be 'yes' or 'no'"; goto loaderr;
1770 }
1771 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1772 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1773 err = "argument must be 'yes' or 'no'"; goto loaderr;
1774 }
1775 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1776 if (!strcasecmp(argv[1],"no")) {
1777 server.appendfsync = APPENDFSYNC_NO;
1778 } else if (!strcasecmp(argv[1],"always")) {
1779 server.appendfsync = APPENDFSYNC_ALWAYS;
1780 } else if (!strcasecmp(argv[1],"everysec")) {
1781 server.appendfsync = APPENDFSYNC_EVERYSEC;
1782 } else {
1783 err = "argument must be 'no', 'always' or 'everysec'";
1784 goto loaderr;
1785 }
1786 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1787 server.requirepass = zstrdup(argv[1]);
1788 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1789 zfree(server.pidfile);
1790 server.pidfile = zstrdup(argv[1]);
1791 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1792 zfree(server.dbfilename);
1793 server.dbfilename = zstrdup(argv[1]);
1794 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1795 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1796 err = "argument must be 'yes' or 'no'"; goto loaderr;
1797 }
1798 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1799 zfree(server.vm_swap_file);
1800 server.vm_swap_file = zstrdup(argv[1]);
1801 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1802 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1803 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1804 server.vm_page_size = strtoll(argv[1], NULL, 10);
1805 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1806 server.vm_pages = strtoll(argv[1], NULL, 10);
1807 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1808 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1809 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1810 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1811 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1812 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1813 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1814 server.vm_max_threads = strtoll(argv[1], NULL, 10);
1815 } else {
1816 err = "Bad directive or wrong number of arguments"; goto loaderr;
1817 }
1818 for (j = 0; j < argc; j++)
1819 sdsfree(argv[j]);
1820 zfree(argv);
1821 sdsfree(line);
1822 }
1823 if (fp != stdin) fclose(fp);
1824 return;
1825
1826 loaderr:
1827 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1828 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1829 fprintf(stderr, ">>> '%s'\n", line);
1830 fprintf(stderr, "%s\n", err);
1831 exit(1);
1832 }
1833
1834 static void freeClientArgv(redisClient *c) {
1835 int j;
1836
1837 for (j = 0; j < c->argc; j++)
1838 decrRefCount(c->argv[j]);
1839 for (j = 0; j < c->mbargc; j++)
1840 decrRefCount(c->mbargv[j]);
1841 c->argc = 0;
1842 c->mbargc = 0;
1843 }
1844
1845 static void freeClient(redisClient *c) {
1846 listNode *ln;
1847
1848 /* Note that if the client we are freeing is blocked into a blocking
1849 * call, we have to set querybuf to NULL *before* to call
1850 * unblockClientWaitingData() to avoid processInputBuffer() will get
1851 * called. Also it is important to remove the file events after
1852 * this, because this call adds the READABLE event. */
1853 sdsfree(c->querybuf);
1854 c->querybuf = NULL;
1855 if (c->flags & REDIS_BLOCKED)
1856 unblockClientWaitingData(c);
1857
1858 /* Unsubscribe from all the pubsub classes */
1859 pubsubUnsubscribeAll(c,0);
1860 dictRelease(c->pubsub_classes);
1861 /* Obvious cleanup */
1862 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1863 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1864 listRelease(c->reply);
1865 freeClientArgv(c);
1866 close(c->fd);
1867 /* Remove from the list of clients */
1868 ln = listSearchKey(server.clients,c);
1869 redisAssert(ln != NULL);
1870 listDelNode(server.clients,ln);
1871 /* Remove from the list of clients waiting for swapped keys */
1872 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1873 ln = listSearchKey(server.io_ready_clients,c);
1874 if (ln) {
1875 listDelNode(server.io_ready_clients,ln);
1876 server.vm_blocked_clients--;
1877 }
1878 }
1879 while (server.vm_enabled && listLength(c->io_keys)) {
1880 ln = listFirst(c->io_keys);
1881 dontWaitForSwappedKey(c,ln->value);
1882 }
1883 listRelease(c->io_keys);
1884 /* Master/slave cleanup */
1885 if (c->flags & REDIS_SLAVE) {
1886 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1887 close(c->repldbfd);
1888 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1889 ln = listSearchKey(l,c);
1890 redisAssert(ln != NULL);
1891 listDelNode(l,ln);
1892 }
1893 if (c->flags & REDIS_MASTER) {
1894 server.master = NULL;
1895 server.replstate = REDIS_REPL_CONNECT;
1896 }
1897 /* Release memory */
1898 zfree(c->argv);
1899 zfree(c->mbargv);
1900 freeClientMultiState(c);
1901 zfree(c);
1902 }
1903
1904 #define GLUEREPLY_UP_TO (1024)
1905 static void glueReplyBuffersIfNeeded(redisClient *c) {
1906 int copylen = 0;
1907 char buf[GLUEREPLY_UP_TO];
1908 listNode *ln;
1909 listIter li;
1910 robj *o;
1911
1912 listRewind(c->reply,&li);
1913 while((ln = listNext(&li))) {
1914 int objlen;
1915
1916 o = ln->value;
1917 objlen = sdslen(o->ptr);
1918 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1919 memcpy(buf+copylen,o->ptr,objlen);
1920 copylen += objlen;
1921 listDelNode(c->reply,ln);
1922 } else {
1923 if (copylen == 0) return;
1924 break;
1925 }
1926 }
1927 /* Now the output buffer is empty, add the new single element */
1928 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1929 listAddNodeHead(c->reply,o);
1930 }
1931
1932 static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1933 redisClient *c = privdata;
1934 int nwritten = 0, totwritten = 0, objlen;
1935 robj *o;
1936 REDIS_NOTUSED(el);
1937 REDIS_NOTUSED(mask);
1938
1939 /* Use writev() if we have enough buffers to send */
1940 if (!server.glueoutputbuf &&
1941 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1942 !(c->flags & REDIS_MASTER))
1943 {
1944 sendReplyToClientWritev(el, fd, privdata, mask);
1945 return;
1946 }
1947
1948 while(listLength(c->reply)) {
1949 if (server.glueoutputbuf && listLength(c->reply) > 1)
1950 glueReplyBuffersIfNeeded(c);
1951
1952 o = listNodeValue(listFirst(c->reply));
1953 objlen = sdslen(o->ptr);
1954
1955 if (objlen == 0) {
1956 listDelNode(c->reply,listFirst(c->reply));
1957 continue;
1958 }
1959
1960 if (c->flags & REDIS_MASTER) {
1961 /* Don't reply to a master */
1962 nwritten = objlen - c->sentlen;
1963 } else {
1964 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
1965 if (nwritten <= 0) break;
1966 }
1967 c->sentlen += nwritten;
1968 totwritten += nwritten;
1969 /* If we fully sent the object on head go to the next one */
1970 if (c->sentlen == objlen) {
1971 listDelNode(c->reply,listFirst(c->reply));
1972 c->sentlen = 0;
1973 }
1974 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
1975 * bytes, in a single threaded server it's a good idea to serve
1976 * other clients as well, even if a very large request comes from
1977 * super fast link that is always able to accept data (in real world
1978 * scenario think about 'KEYS *' against the loopback interfae) */
1979 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
1980 }
1981 if (nwritten == -1) {
1982 if (errno == EAGAIN) {
1983 nwritten = 0;
1984 } else {
1985 redisLog(REDIS_VERBOSE,
1986 "Error writing to client: %s", strerror(errno));
1987 freeClient(c);
1988 return;
1989 }
1990 }
1991 if (totwritten > 0) c->lastinteraction = time(NULL);
1992 if (listLength(c->reply) == 0) {
1993 c->sentlen = 0;
1994 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1995 }
1996 }
1997
1998 static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1999 {
2000 redisClient *c = privdata;
2001 int nwritten = 0, totwritten = 0, objlen, willwrite;
2002 robj *o;
2003 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2004 int offset, ion = 0;
2005 REDIS_NOTUSED(el);
2006 REDIS_NOTUSED(mask);
2007
2008 listNode *node;
2009 while (listLength(c->reply)) {
2010 offset = c->sentlen;
2011 ion = 0;
2012 willwrite = 0;
2013
2014 /* fill-in the iov[] array */
2015 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2016 o = listNodeValue(node);
2017 objlen = sdslen(o->ptr);
2018
2019 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2020 break;
2021
2022 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2023 break; /* no more iovecs */
2024
2025 iov[ion].iov_base = ((char*)o->ptr) + offset;
2026 iov[ion].iov_len = objlen - offset;
2027 willwrite += objlen - offset;
2028 offset = 0; /* just for the first item */
2029 ion++;
2030 }
2031
2032 if(willwrite == 0)
2033 break;
2034
2035 /* write all collected blocks at once */
2036 if((nwritten = writev(fd, iov, ion)) < 0) {
2037 if (errno != EAGAIN) {
2038 redisLog(REDIS_VERBOSE,
2039 "Error writing to client: %s", strerror(errno));
2040 freeClient(c);
2041 return;
2042 }
2043 break;
2044 }
2045
2046 totwritten += nwritten;
2047 offset = c->sentlen;
2048
2049 /* remove written robjs from c->reply */
2050 while (nwritten && listLength(c->reply)) {
2051 o = listNodeValue(listFirst(c->reply));
2052 objlen = sdslen(o->ptr);
2053
2054 if(nwritten >= objlen - offset) {
2055 listDelNode(c->reply, listFirst(c->reply));
2056 nwritten -= objlen - offset;
2057 c->sentlen = 0;
2058 } else {
2059 /* partial write */
2060 c->sentlen += nwritten;
2061 break;
2062 }
2063 offset = 0;
2064 }
2065 }
2066
2067 if (totwritten > 0)
2068 c->lastinteraction = time(NULL);
2069
2070 if (listLength(c->reply) == 0) {
2071 c->sentlen = 0;
2072 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2073 }
2074 }
2075
2076 static struct redisCommand *lookupCommand(char *name) {
2077 int j = 0;
2078 while(cmdTable[j].name != NULL) {
2079 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
2080 j++;
2081 }
2082 return NULL;
2083 }
2084
2085 /* resetClient prepare the client to process the next command */
2086 static void resetClient(redisClient *c) {
2087 freeClientArgv(c);
2088 c->bulklen = -1;
2089 c->multibulk = 0;
2090 }
2091
2092 /* Call() is the core of Redis execution of a command */
2093 static void call(redisClient *c, struct redisCommand *cmd) {
2094 long long dirty;
2095
2096 dirty = server.dirty;
2097 cmd->proc(c);
2098 if (server.appendonly && server.dirty-dirty)
2099 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
2100 if (server.dirty-dirty && listLength(server.slaves))
2101 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
2102 if (listLength(server.monitors))
2103 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
2104 server.stat_numcommands++;
2105 }
2106
2107 /* If this function gets called we already read a whole
2108 * command, argments are in the client argv/argc fields.
2109 * processCommand() execute the command or prepare the
2110 * server for a bulk read from the client.
2111 *
2112 * If 1 is returned the client is still alive and valid and
2113 * and other operations can be performed by the caller. Otherwise
2114 * if 0 is returned the client was destroied (i.e. after QUIT). */
2115 static int processCommand(redisClient *c) {
2116 struct redisCommand *cmd;
2117
2118 /* Free some memory if needed (maxmemory setting) */
2119 if (server.maxmemory) freeMemoryIfNeeded();
2120
2121 /* Handle the multi bulk command type. This is an alternative protocol
2122 * supported by Redis in order to receive commands that are composed of
2123 * multiple binary-safe "bulk" arguments. The latency of processing is
2124 * a bit higher but this allows things like multi-sets, so if this
2125 * protocol is used only for MSET and similar commands this is a big win. */
2126 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2127 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2128 if (c->multibulk <= 0) {
2129 resetClient(c);
2130 return 1;
2131 } else {
2132 decrRefCount(c->argv[c->argc-1]);
2133 c->argc--;
2134 return 1;
2135 }
2136 } else if (c->multibulk) {
2137 if (c->bulklen == -1) {
2138 if (((char*)c->argv[0]->ptr)[0] != '$') {
2139 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2140 resetClient(c);
2141 return 1;
2142 } else {
2143 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2144 decrRefCount(c->argv[0]);
2145 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2146 c->argc--;
2147 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2148 resetClient(c);
2149 return 1;
2150 }
2151 c->argc--;
2152 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2153 return 1;
2154 }
2155 } else {
2156 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2157 c->mbargv[c->mbargc] = c->argv[0];
2158 c->mbargc++;
2159 c->argc--;
2160 c->multibulk--;
2161 if (c->multibulk == 0) {
2162 robj **auxargv;
2163 int auxargc;
2164
2165 /* Here we need to swap the multi-bulk argc/argv with the
2166 * normal argc/argv of the client structure. */
2167 auxargv = c->argv;
2168 c->argv = c->mbargv;
2169 c->mbargv = auxargv;
2170
2171 auxargc = c->argc;
2172 c->argc = c->mbargc;
2173 c->mbargc = auxargc;
2174
2175 /* We need to set bulklen to something different than -1
2176 * in order for the code below to process the command without
2177 * to try to read the last argument of a bulk command as
2178 * a special argument. */
2179 c->bulklen = 0;
2180 /* continue below and process the command */
2181 } else {
2182 c->bulklen = -1;
2183 return 1;
2184 }
2185 }
2186 }
2187 /* -- end of multi bulk commands processing -- */
2188
2189 /* The QUIT command is handled as a special case. Normal command
2190 * procs are unable to close the client connection safely */
2191 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
2192 freeClient(c);
2193 return 0;
2194 }
2195
2196 /* Now lookup the command and check ASAP about trivial error conditions
2197 * such wrong arity, bad command name and so forth. */
2198 cmd = lookupCommand(c->argv[0]->ptr);
2199 if (!cmd) {
2200 addReplySds(c,
2201 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2202 (char*)c->argv[0]->ptr));
2203 resetClient(c);
2204 return 1;
2205 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2206 (c->argc < -cmd->arity)) {
2207 addReplySds(c,
2208 sdscatprintf(sdsempty(),
2209 "-ERR wrong number of arguments for '%s' command\r\n",
2210 cmd->name));
2211 resetClient(c);
2212 return 1;
2213 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2214 /* This is a bulk command, we have to read the last argument yet. */
2215 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2216
2217 decrRefCount(c->argv[c->argc-1]);
2218 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2219 c->argc--;
2220 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2221 resetClient(c);
2222 return 1;
2223 }
2224 c->argc--;
2225 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2226 /* It is possible that the bulk read is already in the
2227 * buffer. Check this condition and handle it accordingly.
2228 * This is just a fast path, alternative to call processInputBuffer().
2229 * It's a good idea since the code is small and this condition
2230 * happens most of the times. */
2231 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2232 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2233 c->argc++;
2234 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2235 } else {
2236 /* Otherwise return... there is to read the last argument
2237 * from the socket. */
2238 return 1;
2239 }
2240 }
2241 /* Let's try to share objects on the command arguments vector */
2242 if (server.shareobjects) {
2243 int j;
2244 for(j = 1; j < c->argc; j++)
2245 c->argv[j] = tryObjectSharing(c->argv[j]);
2246 }
2247 /* Let's try to encode the bulk object to save space. */
2248 if (cmd->flags & REDIS_CMD_BULK)
2249 tryObjectEncoding(c->argv[c->argc-1]);
2250
2251 /* Check if the user is authenticated */
2252 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2253 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2254 resetClient(c);
2255 return 1;
2256 }
2257
2258 /* Handle the maxmemory directive */
2259 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2260 zmalloc_used_memory() > server.maxmemory)
2261 {
2262 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2263 resetClient(c);
2264 return 1;
2265 }
2266
2267 /* Exec the command */
2268 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
2269 queueMultiCommand(c,cmd);
2270 addReply(c,shared.queued);
2271 } else {
2272 if (server.vm_enabled && server.vm_max_threads > 0 &&
2273 blockClientOnSwappedKeys(cmd,c)) return 1;
2274 call(c,cmd);
2275 }
2276
2277 /* Prepare the client for the next command */
2278 resetClient(c);
2279 return 1;
2280 }
2281
2282 static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
2283 listNode *ln;
2284 listIter li;
2285 int outc = 0, j;
2286 robj **outv;
2287 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2288 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2289 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2290 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2291 robj *lenobj;
2292
2293 if (argc <= REDIS_STATIC_ARGS) {
2294 outv = static_outv;
2295 } else {
2296 outv = zmalloc(sizeof(robj*)*(argc*3+1));
2297 }
2298
2299 lenobj = createObject(REDIS_STRING,
2300 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2301 lenobj->refcount = 0;
2302 outv[outc++] = lenobj;
2303 for (j = 0; j < argc; j++) {
2304 lenobj = createObject(REDIS_STRING,
2305 sdscatprintf(sdsempty(),"$%lu\r\n",
2306 (unsigned long) stringObjectLen(argv[j])));
2307 lenobj->refcount = 0;
2308 outv[outc++] = lenobj;
2309 outv[outc++] = argv[j];
2310 outv[outc++] = shared.crlf;
2311 }
2312
2313 /* Increment all the refcounts at start and decrement at end in order to
2314 * be sure to free objects if there is no slave in a replication state
2315 * able to be feed with commands */
2316 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
2317 listRewind(slaves,&li);
2318 while((ln = listNext(&li))) {
2319 redisClient *slave = ln->value;
2320
2321 /* Don't feed slaves that are still waiting for BGSAVE to start */
2322 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
2323
2324 /* Feed all the other slaves, MONITORs and so on */
2325 if (slave->slaveseldb != dictid) {
2326 robj *selectcmd;
2327
2328 switch(dictid) {
2329 case 0: selectcmd = shared.select0; break;
2330 case 1: selectcmd = shared.select1; break;
2331 case 2: selectcmd = shared.select2; break;
2332 case 3: selectcmd = shared.select3; break;
2333 case 4: selectcmd = shared.select4; break;
2334 case 5: selectcmd = shared.select5; break;
2335 case 6: selectcmd = shared.select6; break;
2336 case 7: selectcmd = shared.select7; break;
2337 case 8: selectcmd = shared.select8; break;
2338 case 9: selectcmd = shared.select9; break;
2339 default:
2340 selectcmd = createObject(REDIS_STRING,
2341 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2342 selectcmd->refcount = 0;
2343 break;
2344 }
2345 addReply(slave,selectcmd);
2346 slave->slaveseldb = dictid;
2347 }
2348 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
2349 }
2350 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
2351 if (outv != static_outv) zfree(outv);
2352 }
2353
2354 static void processInputBuffer(redisClient *c) {
2355 again:
2356 /* Before to process the input buffer, make sure the client is not
2357 * waitig for a blocking operation such as BLPOP. Note that the first
2358 * iteration the client is never blocked, otherwise the processInputBuffer
2359 * would not be called at all, but after the execution of the first commands
2360 * in the input buffer the client may be blocked, and the "goto again"
2361 * will try to reiterate. The following line will make it return asap. */
2362 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
2363 if (c->bulklen == -1) {
2364 /* Read the first line of the query */
2365 char *p = strchr(c->querybuf,'\n');
2366 size_t querylen;
2367
2368 if (p) {
2369 sds query, *argv;
2370 int argc, j;
2371
2372 query = c->querybuf;
2373 c->querybuf = sdsempty();
2374 querylen = 1+(p-(query));
2375 if (sdslen(query) > querylen) {
2376 /* leave data after the first line of the query in the buffer */
2377 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2378 }
2379 *p = '\0'; /* remove "\n" */
2380 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2381 sdsupdatelen(query);
2382
2383 /* Now we can split the query in arguments */
2384 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
2385 sdsfree(query);
2386
2387 if (c->argv) zfree(c->argv);
2388 c->argv = zmalloc(sizeof(robj*)*argc);
2389
2390 for (j = 0; j < argc; j++) {
2391 if (sdslen(argv[j])) {
2392 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2393 c->argc++;
2394 } else {
2395 sdsfree(argv[j]);
2396 }
2397 }
2398 zfree(argv);
2399 if (c->argc) {
2400 /* Execute the command. If the client is still valid
2401 * after processCommand() return and there is something
2402 * on the query buffer try to process the next command. */
2403 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2404 } else {
2405 /* Nothing to process, argc == 0. Just process the query
2406 * buffer if it's not empty or return to the caller */
2407 if (sdslen(c->querybuf)) goto again;
2408 }
2409 return;
2410 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
2411 redisLog(REDIS_VERBOSE, "Client protocol error");
2412 freeClient(c);
2413 return;
2414 }
2415 } else {
2416 /* Bulk read handling. Note that if we are at this point
2417 the client already sent a command terminated with a newline,
2418 we are reading the bulk data that is actually the last
2419 argument of the command. */
2420 int qbl = sdslen(c->querybuf);
2421
2422 if (c->bulklen <= qbl) {
2423 /* Copy everything but the final CRLF as final argument */
2424 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2425 c->argc++;
2426 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2427 /* Process the command. If the client is still valid after
2428 * the processing and there is more data in the buffer
2429 * try to parse it. */
2430 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2431 return;
2432 }
2433 }
2434 }
2435
2436 static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2437 redisClient *c = (redisClient*) privdata;
2438 char buf[REDIS_IOBUF_LEN];
2439 int nread;
2440 REDIS_NOTUSED(el);
2441 REDIS_NOTUSED(mask);
2442
2443 nread = read(fd, buf, REDIS_IOBUF_LEN);
2444 if (nread == -1) {
2445 if (errno == EAGAIN) {
2446 nread = 0;
2447 } else {
2448 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
2449 freeClient(c);
2450 return;
2451 }
2452 } else if (nread == 0) {
2453 redisLog(REDIS_VERBOSE, "Client closed connection");
2454 freeClient(c);
2455 return;
2456 }
2457 if (nread) {
2458 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2459 c->lastinteraction = time(NULL);
2460 } else {
2461 return;
2462 }
2463 processInputBuffer(c);
2464 }
2465
2466 static int selectDb(redisClient *c, int id) {
2467 if (id < 0 || id >= server.dbnum)
2468 return REDIS_ERR;
2469 c->db = &server.db[id];
2470 return REDIS_OK;
2471 }
2472
2473 static void *dupClientReplyValue(void *o) {
2474 incrRefCount((robj*)o);
2475 return o;
2476 }
2477
2478 static redisClient *createClient(int fd) {
2479 redisClient *c = zmalloc(sizeof(*c));
2480
2481 anetNonBlock(NULL,fd);
2482 anetTcpNoDelay(NULL,fd);
2483 if (!c) return NULL;
2484 selectDb(c,0);
2485 c->fd = fd;
2486 c->querybuf = sdsempty();
2487 c->argc = 0;
2488 c->argv = NULL;
2489 c->bulklen = -1;
2490 c->multibulk = 0;
2491 c->mbargc = 0;
2492 c->mbargv = NULL;
2493 c->sentlen = 0;
2494 c->flags = 0;
2495 c->lastinteraction = time(NULL);
2496 c->authenticated = 0;
2497 c->replstate = REDIS_REPL_NONE;
2498 c->reply = listCreate();
2499 listSetFreeMethod(c->reply,decrRefCount);
2500 listSetDupMethod(c->reply,dupClientReplyValue);
2501 c->blockingkeys = NULL;
2502 c->blockingkeysnum = 0;
2503 c->io_keys = listCreate();
2504 c->pubsub_classes = dictCreate(&setDictType,NULL);
2505 listSetFreeMethod(c->io_keys,decrRefCount);
2506 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2507 readQueryFromClient, c) == AE_ERR) {
2508 freeClient(c);
2509 return NULL;
2510 }
2511 listAddNodeTail(server.clients,c);
2512 initClientMultiState(c);
2513 return c;
2514 }
2515
2516 static void addReply(redisClient *c, robj *obj) {
2517 if (listLength(c->reply) == 0 &&
2518 (c->replstate == REDIS_REPL_NONE ||
2519 c->replstate == REDIS_REPL_ONLINE) &&
2520 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
2521 sendReplyToClient, c) == AE_ERR) return;
2522
2523 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2524 obj = dupStringObject(obj);
2525 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2526 }
2527 listAddNodeTail(c->reply,getDecodedObject(obj));
2528 }
2529
2530 static void addReplySds(redisClient *c, sds s) {
2531 robj *o = createObject(REDIS_STRING,s);
2532 addReply(c,o);
2533 decrRefCount(o);
2534 }
2535
2536 static void addReplyDouble(redisClient *c, double d) {
2537 char buf[128];
2538
2539 snprintf(buf,sizeof(buf),"%.17g",d);
2540 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
2541 (unsigned long) strlen(buf),buf));
2542 }
2543
2544 static void addReplyLong(redisClient *c, long l) {
2545 char buf[128];
2546 size_t len;
2547
2548 if (l == 0) {
2549 addReply(c,shared.czero);
2550 return;
2551 } else if (l == 1) {
2552 addReply(c,shared.cone);
2553 return;
2554 }
2555 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2556 addReplySds(c,sdsnewlen(buf,len));
2557 }
2558
2559 static void addReplyUlong(redisClient *c, unsigned long ul) {
2560 char buf[128];
2561 size_t len;
2562
2563 if (ul == 0) {
2564 addReply(c,shared.czero);
2565 return;
2566 } else if (ul == 1) {
2567 addReply(c,shared.cone);
2568 return;
2569 }
2570 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2571 addReplySds(c,sdsnewlen(buf,len));
2572 }
2573
2574 static void addReplyBulkLen(redisClient *c, robj *obj) {
2575 size_t len;
2576
2577 if (obj->encoding == REDIS_ENCODING_RAW) {
2578 len = sdslen(obj->ptr);
2579 } else {
2580 long n = (long)obj->ptr;
2581
2582 /* Compute how many bytes will take this integer as a radix 10 string */
2583 len = 1;
2584 if (n < 0) {
2585 len++;
2586 n = -n;
2587 }
2588 while((n = n/10) != 0) {
2589 len++;
2590 }
2591 }
2592 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
2593 }
2594
2595 static void addReplyBulk(redisClient *c, robj *obj) {
2596 addReplyBulkLen(c,obj);
2597 addReply(c,obj);
2598 addReply(c,shared.crlf);
2599 }
2600
2601 /* In the CONFIG command we need to add vanilla C string as bulk replies */
2602 static void addReplyBulkCString(redisClient *c, char *s) {
2603 if (s == NULL) {
2604 addReply(c,shared.nullbulk);
2605 } else {
2606 robj *o = createStringObject(s,strlen(s));
2607 addReplyBulk(c,o);
2608 decrRefCount(o);
2609 }
2610 }
2611
2612 static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2613 int cport, cfd;
2614 char cip[128];
2615 redisClient *c;
2616 REDIS_NOTUSED(el);
2617 REDIS_NOTUSED(mask);
2618 REDIS_NOTUSED(privdata);
2619
2620 cfd = anetAccept(server.neterr, fd, cip, &cport);
2621 if (cfd == AE_ERR) {
2622 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
2623 return;
2624 }
2625 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
2626 if ((c = createClient(cfd)) == NULL) {
2627 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2628 close(cfd); /* May be already closed, just ingore errors */
2629 return;
2630 }
2631 /* If maxclient directive is set and this is one client more... close the
2632 * connection. Note that we create the client instead to check before
2633 * for this condition, since now the socket is already set in nonblocking
2634 * mode and we can send an error for free using the Kernel I/O */
2635 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2636 char *err = "-ERR max number of clients reached\r\n";
2637
2638 /* That's a best effort error message, don't check write errors */
2639 if (write(c->fd,err,strlen(err)) == -1) {
2640 /* Nothing to do, Just to avoid the warning... */
2641 }
2642 freeClient(c);
2643 return;
2644 }
2645 server.stat_numconnections++;
2646 }
2647
2648 /* ======================= Redis objects implementation ===================== */
2649
2650 static robj *createObject(int type, void *ptr) {
2651 robj *o;
2652
2653 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2654 if (listLength(server.objfreelist)) {
2655 listNode *head = listFirst(server.objfreelist);
2656 o = listNodeValue(head);
2657 listDelNode(server.objfreelist,head);
2658 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2659 } else {
2660 if (server.vm_enabled) {
2661 pthread_mutex_unlock(&server.obj_freelist_mutex);
2662 o = zmalloc(sizeof(*o));
2663 } else {
2664 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2665 }
2666 }
2667 o->type = type;
2668 o->encoding = REDIS_ENCODING_RAW;
2669 o->ptr = ptr;
2670 o->refcount = 1;
2671 if (server.vm_enabled) {
2672 /* Note that this code may run in the context of an I/O thread
2673 * and accessing to server.unixtime in theory is an error
2674 * (no locks). But in practice this is safe, and even if we read
2675 * garbage Redis will not fail, as it's just a statistical info */
2676 o->vm.atime = server.unixtime;
2677 o->storage = REDIS_VM_MEMORY;
2678 }
2679 return o;
2680 }
2681
2682 static robj *createStringObject(char *ptr, size_t len) {
2683 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2684 }
2685
2686 static robj *dupStringObject(robj *o) {
2687 assert(o->encoding == REDIS_ENCODING_RAW);
2688 return createStringObject(o->ptr,sdslen(o->ptr));
2689 }
2690
2691 static robj *createListObject(void) {
2692 list *l = listCreate();
2693
2694 listSetFreeMethod(l,decrRefCount);
2695 return createObject(REDIS_LIST,l);
2696 }
2697
2698 static robj *createSetObject(void) {
2699 dict *d = dictCreate(&setDictType,NULL);
2700 return createObject(REDIS_SET,d);
2701 }
2702
2703 static robj *createHashObject(void) {
2704 /* All the Hashes start as zipmaps. Will be automatically converted
2705 * into hash tables if there are enough elements or big elements
2706 * inside. */
2707 unsigned char *zm = zipmapNew();
2708 robj *o = createObject(REDIS_HASH,zm);
2709 o->encoding = REDIS_ENCODING_ZIPMAP;
2710 return o;
2711 }
2712
2713 static robj *createZsetObject(void) {
2714 zset *zs = zmalloc(sizeof(*zs));
2715
2716 zs->dict = dictCreate(&zsetDictType,NULL);
2717 zs->zsl = zslCreate();
2718 return createObject(REDIS_ZSET,zs);
2719 }
2720
2721 static void freeStringObject(robj *o) {
2722 if (o->encoding == REDIS_ENCODING_RAW) {
2723 sdsfree(o->ptr);
2724 }
2725 }
2726
2727 static void freeListObject(robj *o) {
2728 listRelease((list*) o->ptr);
2729 }
2730
2731 static void freeSetObject(robj *o) {
2732 dictRelease((dict*) o->ptr);
2733 }
2734
2735 static void freeZsetObject(robj *o) {
2736 zset *zs = o->ptr;
2737
2738 dictRelease(zs->dict);
2739 zslFree(zs->zsl);
2740 zfree(zs);
2741 }
2742
2743 static void freeHashObject(robj *o) {
2744 switch (o->encoding) {
2745 case REDIS_ENCODING_HT:
2746 dictRelease((dict*) o->ptr);
2747 break;
2748 case REDIS_ENCODING_ZIPMAP:
2749 zfree(o->ptr);
2750 break;
2751 default:
2752 redisAssert(0);
2753 break;
2754 }
2755 }
2756
2757 static void incrRefCount(robj *o) {
2758 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
2759 o->refcount++;
2760 }
2761
2762 static void decrRefCount(void *obj) {
2763 robj *o = obj;
2764
2765 /* Object is a key of a swapped out value, or in the process of being
2766 * loaded. */
2767 if (server.vm_enabled &&
2768 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2769 {
2770 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2771 redisAssert(o->refcount == 1);
2772 }
2773 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2774 redisAssert(o->type == REDIS_STRING);
2775 freeStringObject(o);
2776 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2777 pthread_mutex_lock(&server.obj_freelist_mutex);
2778 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2779 !listAddNodeHead(server.objfreelist,o))
2780 zfree(o);
2781 pthread_mutex_unlock(&server.obj_freelist_mutex);
2782 server.vm_stats_swapped_objects--;
2783 return;
2784 }
2785 /* Object is in memory, or in the process of being swapped out. */
2786 if (--(o->refcount) == 0) {
2787 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2788 vmCancelThreadedIOJob(obj);
2789 switch(o->type) {
2790 case REDIS_STRING: freeStringObject(o); break;
2791 case REDIS_LIST: freeListObject(o); break;
2792 case REDIS_SET: freeSetObject(o); break;
2793 case REDIS_ZSET: freeZsetObject(o); break;
2794 case REDIS_HASH: freeHashObject(o); break;
2795 default: redisAssert(0); break;
2796 }
2797 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
2798 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2799 !listAddNodeHead(server.objfreelist,o))
2800 zfree(o);
2801 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
2802 }
2803 }
2804
2805 static robj *lookupKey(redisDb *db, robj *key) {
2806 dictEntry *de = dictFind(db->dict,key);
2807 if (de) {
2808 robj *key = dictGetEntryKey(de);
2809 robj *val = dictGetEntryVal(de);
2810
2811 if (server.vm_enabled) {
2812 if (key->storage == REDIS_VM_MEMORY ||
2813 key->storage == REDIS_VM_SWAPPING)
2814 {
2815 /* If we were swapping the object out, stop it, this key
2816 * was requested. */
2817 if (key->storage == REDIS_VM_SWAPPING)
2818 vmCancelThreadedIOJob(key);
2819 /* Update the access time of the key for the aging algorithm. */
2820 key->vm.atime = server.unixtime;
2821 } else {
2822 int notify = (key->storage == REDIS_VM_LOADING);
2823
2824 /* Our value was swapped on disk. Bring it at home. */
2825 redisAssert(val == NULL);
2826 val = vmLoadObject(key);
2827 dictGetEntryVal(de) = val;
2828
2829 /* Clients blocked by the VM subsystem may be waiting for
2830 * this key... */
2831 if (notify) handleClientsBlockedOnSwappedKey(db,key);
2832 }
2833 }
2834 return val;
2835 } else {
2836 return NULL;
2837 }
2838 }
2839
2840 static robj *lookupKeyRead(redisDb *db, robj *key) {
2841 expireIfNeeded(db,key);
2842 return lookupKey(db,key);
2843 }
2844
2845 static robj *lookupKeyWrite(redisDb *db, robj *key) {
2846 deleteIfVolatile(db,key);
2847 return lookupKey(db,key);
2848 }
2849
2850 static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2851 robj *o = lookupKeyRead(c->db, key);
2852 if (!o) addReply(c,reply);
2853 return o;
2854 }
2855
2856 static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2857 robj *o = lookupKeyWrite(c->db, key);
2858 if (!o) addReply(c,reply);
2859 return o;
2860 }
2861
2862 static int checkType(redisClient *c, robj *o, int type) {
2863 if (o->type != type) {
2864 addReply(c,shared.wrongtypeerr);
2865 return 1;
2866 }
2867 return 0;
2868 }
2869
2870 static int deleteKey(redisDb *db, robj *key) {
2871 int retval;
2872
2873 /* We need to protect key from destruction: after the first dictDelete()
2874 * it may happen that 'key' is no longer valid if we don't increment
2875 * it's count. This may happen when we get the object reference directly
2876 * from the hash table with dictRandomKey() or dict iterators */
2877 incrRefCount(key);
2878 if (dictSize(db->expires)) dictDelete(db->expires,key);
2879 retval = dictDelete(db->dict,key);
2880 decrRefCount(key);
2881
2882 return retval == DICT_OK;
2883 }
2884
2885 /* Try to share an object against the shared objects pool */
2886 static robj *tryObjectSharing(robj *o) {
2887 struct dictEntry *de;
2888 unsigned long c;
2889
2890 if (o == NULL || server.shareobjects == 0) return o;
2891
2892 redisAssert(o->type == REDIS_STRING);
2893 de = dictFind(server.sharingpool,o);
2894 if (de) {
2895 robj *shared = dictGetEntryKey(de);
2896
2897 c = ((unsigned long) dictGetEntryVal(de))+1;
2898 dictGetEntryVal(de) = (void*) c;
2899 incrRefCount(shared);
2900 decrRefCount(o);
2901 return shared;
2902 } else {
2903 /* Here we are using a stream algorihtm: Every time an object is
2904 * shared we increment its count, everytime there is a miss we
2905 * recrement the counter of a random object. If this object reaches
2906 * zero we remove the object and put the current object instead. */
2907 if (dictSize(server.sharingpool) >=
2908 server.sharingpoolsize) {
2909 de = dictGetRandomKey(server.sharingpool);
2910 redisAssert(de != NULL);
2911 c = ((unsigned long) dictGetEntryVal(de))-1;
2912 dictGetEntryVal(de) = (void*) c;
2913 if (c == 0) {
2914 dictDelete(server.sharingpool,de->key);
2915 }
2916 } else {
2917 c = 0; /* If the pool is empty we want to add this object */
2918 }
2919 if (c == 0) {
2920 int retval;
2921
2922 retval = dictAdd(server.sharingpool,o,(void*)1);
2923 redisAssert(retval == DICT_OK);
2924 incrRefCount(o);
2925 }
2926 return o;
2927 }
2928 }
2929
2930 /* Check if the nul-terminated string 's' can be represented by a long
2931 * (that is, is a number that fits into long without any other space or
2932 * character before or after the digits).
2933 *
2934 * If so, the function returns REDIS_OK and *longval is set to the value
2935 * of the number. Otherwise REDIS_ERR is returned */
2936 static int isStringRepresentableAsLong(sds s, long *longval) {
2937 char buf[32], *endptr;
2938 long value;
2939 int slen;
2940
2941 value = strtol(s, &endptr, 10);
2942 if (endptr[0] != '\0') return REDIS_ERR;
2943 slen = snprintf(buf,32,"%ld",value);
2944
2945 /* If the number converted back into a string is not identical
2946 * then it's not possible to encode the string as integer */
2947 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2948 if (longval) *longval = value;
2949 return REDIS_OK;
2950 }
2951
2952 /* Try to encode a string object in order to save space */
2953 static int tryObjectEncoding(robj *o) {
2954 long value;
2955 sds s = o->ptr;
2956
2957 if (o->encoding != REDIS_ENCODING_RAW)
2958 return REDIS_ERR; /* Already encoded */
2959
2960 /* It's not save to encode shared objects: shared objects can be shared
2961 * everywhere in the "object space" of Redis. Encoded objects can only
2962 * appear as "values" (and not, for instance, as keys) */
2963 if (o->refcount > 1) return REDIS_ERR;
2964
2965 /* Currently we try to encode only strings */
2966 redisAssert(o->type == REDIS_STRING);
2967
2968 /* Check if we can represent this string as a long integer */
2969 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
2970
2971 /* Ok, this object can be encoded */
2972 o->encoding = REDIS_ENCODING_INT;
2973 sdsfree(o->ptr);
2974 o->ptr = (void*) value;
2975 return REDIS_OK;
2976 }
2977
2978 /* Get a decoded version of an encoded object (returned as a new object).
2979 * If the object is already raw-encoded just increment the ref count. */
2980 static robj *getDecodedObject(robj *o) {
2981 robj *dec;
2982
2983 if (o->encoding == REDIS_ENCODING_RAW) {
2984 incrRefCount(o);
2985 return o;
2986 }
2987 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2988 char buf[32];
2989
2990 snprintf(buf,32,"%ld",(long)o->ptr);
2991 dec = createStringObject(buf,strlen(buf));
2992 return dec;
2993 } else {
2994 redisAssert(1 != 1);
2995 }
2996 }
2997
2998 /* Compare two string objects via strcmp() or alike.
2999 * Note that the objects may be integer-encoded. In such a case we
3000 * use snprintf() to get a string representation of the numbers on the stack
3001 * and compare the strings, it's much faster than calling getDecodedObject().
3002 *
3003 * Important note: if objects are not integer encoded, but binary-safe strings,
3004 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3005 * binary safe. */
3006 static int compareStringObjects(robj *a, robj *b) {
3007 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3008 char bufa[128], bufb[128], *astr, *bstr;
3009 int bothsds = 1;
3010
3011 if (a == b) return 0;
3012 if (a->encoding != REDIS_ENCODING_RAW) {
3013 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3014 astr = bufa;
3015 bothsds = 0;
3016 } else {
3017 astr = a->ptr;
3018 }
3019 if (b->encoding != REDIS_ENCODING_RAW) {
3020 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3021 bstr = bufb;
3022 bothsds = 0;
3023 } else {
3024 bstr = b->ptr;
3025 }
3026 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3027 }
3028
3029 static size_t stringObjectLen(robj *o) {
3030 redisAssert(o->type == REDIS_STRING);
3031 if (o->encoding == REDIS_ENCODING_RAW) {
3032 return sdslen(o->ptr);
3033 } else {
3034 char buf[32];
3035
3036 return snprintf(buf,32,"%ld",(long)o->ptr);
3037 }
3038 }
3039
3040 /*============================ RDB saving/loading =========================== */
3041
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * was written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3046
/* Write the time 't' to 'fp' as a 4 byte int32_t in host byte order.
 * Returns 0 on success, -1 if nothing was written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3052
3053 /* check rdbLoadLen() comments for more info */
3054 static int rdbSaveLen(FILE *fp, uint32_t len) {
3055 unsigned char buf[2];
3056
3057 if (len < (1<<6)) {
3058 /* Save a 6 bit len */
3059 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3060 if (fwrite(buf,1,1,fp) == 0) return -1;
3061 } else if (len < (1<<14)) {
3062 /* Save a 14 bit len */
3063 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3064 buf[1] = len&0xFF;
3065 if (fwrite(buf,2,1,fp) == 0) return -1;
3066 } else {
3067 /* Save a 32 bit len */
3068 buf[0] = (REDIS_RDB_32BITLEN<<6);
3069 if (fwrite(buf,1,1,fp) == 0) return -1;
3070 len = htonl(len);
3071 if (fwrite(&len,4,1,fp) == 0) return -1;
3072 }
3073 return 0;
3074 }
3075
3076 /* String objects in the form "2391" "-100" without any space and with a
3077 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3078 * encoded as integers to save space */
3079 static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3080 long long value;
3081 char *endptr, buf[32];
3082
3083 /* Check if it's possible to encode this value as a number */
3084 value = strtoll(s, &endptr, 10);
3085 if (endptr[0] != '\0') return 0;
3086 snprintf(buf,32,"%lld",value);
3087
3088 /* If the number converted back into a string is not identical
3089 * then it's not possible to encode the string as integer */
3090 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3091
3092 /* Finally check if it fits in our ranges */
3093 if (value >= -(1<<7) && value <= (1<<7)-1) {
3094 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3095 enc[1] = value&0xFF;
3096 return 2;
3097 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3098 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3099 enc[1] = value&0xFF;
3100 enc[2] = (value>>8)&0xFF;
3101 return 3;
3102 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3103 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3104 enc[1] = value&0xFF;
3105 enc[2] = (value>>8)&0xFF;
3106 enc[3] = (value>>16)&0xFF;
3107 enc[4] = (value>>24)&0xFF;
3108 return 5;
3109 } else {
3110 return 0;
3111 }
3112 }
3113
3114 static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3115 size_t comprlen, outlen;
3116 unsigned char byte;
3117 void *out;
3118
3119 /* We require at least four bytes compression for this to be worth it */
3120 if (len <= 4) return 0;
3121 outlen = len-4;
3122 if ((out = zmalloc(outlen+1)) == NULL) return 0;
3123 comprlen = lzf_compress(s, len, out, outlen);
3124 if (comprlen == 0) {
3125 zfree(out);
3126 return 0;
3127 }
3128 /* Data compressed! Let's save it on disk */
3129 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3130 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3131 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
3132 if (rdbSaveLen(fp,len) == -1) goto writeerr;
3133 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
3134 zfree(out);
3135 return comprlen;
3136
3137 writeerr:
3138 zfree(out);
3139 return -1;
3140 }
3141
3142 /* Save a string objet as [len][data] on disk. If the object is a string
3143 * representation of an integer value we try to safe it in a special form */
3144 static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3145 int enclen;
3146
3147 /* Try integer encoding */
3148 if (len <= 11) {
3149 unsigned char buf[5];
3150 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3151 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3152 return 0;
3153 }
3154 }
3155
3156 /* Try LZF compression - under 20 bytes it's unable to compress even
3157 * aaaaaaaaaaaaaaaaaa so skip it */
3158 if (server.rdbcompression && len > 20) {
3159 int retval;
3160
3161 retval = rdbSaveLzfStringObject(fp,s,len);
3162 if (retval == -1) return -1;
3163 if (retval > 0) return 0;
3164 /* retval == 0 means data can't be compressed, save the old way */
3165 }
3166
3167 /* Store verbatim */
3168 if (rdbSaveLen(fp,len) == -1) return -1;
3169 if (len && fwrite(s,len,1,fp) == 0) return -1;
3170 return 0;
3171 }
3172
3173 /* Like rdbSaveStringObjectRaw() but handle encoded objects */
3174 static int rdbSaveStringObject(FILE *fp, robj *obj) {
3175 int retval;
3176
3177 /* Avoid incr/decr ref count business when possible.
3178 * This plays well with copy-on-write given that we are probably
3179 * in a child process (BGSAVE). Also this makes sure key objects
3180 * of swapped objects are not incRefCount-ed (an assert does not allow
3181 * this in order to avoid bugs) */
3182 if (obj->encoding != REDIS_ENCODING_RAW) {
3183 obj = getDecodedObject(obj);
3184 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3185 decrRefCount(obj);
3186 } else {
3187 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
3188 }
3189 return retval;
3190 }
3191
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int outlen = 1; /* The special values take the prefix byte only. */

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        outlen = out[0]+1;
    }
    return (fwrite(out,outlen,1,fp) == 0) ? -1 : 0;
}
3218
3219 /* Save a Redis object. */
3220 static int rdbSaveObject(FILE *fp, robj *o) {
3221 if (o->type == REDIS_STRING) {
3222 /* Save a string value */
3223 if (rdbSaveStringObject(fp,o) == -1) return -1;
3224 } else if (o->type == REDIS_LIST) {
3225 /* Save a list value */
3226 list *list = o->ptr;
3227 listIter li;
3228 listNode *ln;
3229
3230 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3231 listRewind(list,&li);
3232 while((ln = listNext(&li))) {
3233 robj *eleobj = listNodeValue(ln);
3234
3235 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3236 }
3237 } else if (o->type == REDIS_SET) {
3238 /* Save a set value */
3239 dict *set = o->ptr;
3240 dictIterator *di = dictGetIterator(set);
3241 dictEntry *de;
3242
3243 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3244 while((de = dictNext(di)) != NULL) {
3245 robj *eleobj = dictGetEntryKey(de);
3246
3247 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3248 }
3249 dictReleaseIterator(di);
3250 } else if (o->type == REDIS_ZSET) {
3251 /* Save a set value */
3252 zset *zs = o->ptr;
3253 dictIterator *di = dictGetIterator(zs->dict);
3254 dictEntry *de;
3255
3256 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3257 while((de = dictNext(di)) != NULL) {
3258 robj *eleobj = dictGetEntryKey(de);
3259 double *score = dictGetEntryVal(de);
3260
3261 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3262 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3263 }
3264 dictReleaseIterator(di);
3265 } else if (o->type == REDIS_HASH) {
3266 /* Save a hash value */
3267 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3268 unsigned char *p = zipmapRewind(o->ptr);
3269 unsigned int count = zipmapLen(o->ptr);
3270 unsigned char *key, *val;
3271 unsigned int klen, vlen;
3272
3273 if (rdbSaveLen(fp,count) == -1) return -1;
3274 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3275 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3276 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3277 }
3278 } else {
3279 dictIterator *di = dictGetIterator(o->ptr);
3280 dictEntry *de;
3281
3282 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3283 while((de = dictNext(di)) != NULL) {
3284 robj *key = dictGetEntryKey(de);
3285 robj *val = dictGetEntryVal(de);
3286
3287 if (rdbSaveStringObject(fp,key) == -1) return -1;
3288 if (rdbSaveStringObject(fp,val) == -1) return -1;
3289 }
3290 dictReleaseIterator(di);
3291 }
3292 } else {
3293 redisAssert(0);
3294 }
3295 return 0;
3296 }
3297
3298 /* Return the length the object will have on disk if saved with
3299 * the rdbSaveObject() function. Currently we use a trick to get
3300 * this length with very little changes to the code. In the future
3301 * we could switch to a faster solution. */
3302 static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3303 if (fp == NULL) fp = server.devnull;
3304 rewind(fp);
3305 assert(rdbSaveObject(fp,o) != 1);
3306 return ftello(fp);
3307 }
3308
3309 /* Return the number of pages required to save this object in the swap file */
3310 static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3311 off_t bytes = rdbSavedObjectLen(o,fp);
3312
3313 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3314 }
3315
3316 /* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3317 static int rdbSave(char *filename) {
3318 dictIterator *di = NULL;
3319 dictEntry *de;
3320 FILE *fp;
3321 char tmpfile[256];
3322 int j;
3323 time_t now = time(NULL);
3324
3325 /* Wait for I/O therads to terminate, just in case this is a
3326 * foreground-saving, to avoid seeking the swap file descriptor at the
3327 * same time. */
3328 if (server.vm_enabled)
3329 waitEmptyIOJobsQueue();
3330
3331 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
3332 fp = fopen(tmpfile,"w");
3333 if (!fp) {
3334 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3335 return REDIS_ERR;
3336 }
3337 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
3338 for (j = 0; j < server.dbnum; j++) {
3339 redisDb *db = server.db+j;
3340 dict *d = db->dict;
3341 if (dictSize(d) == 0) continue;
3342 di = dictGetIterator(d);
3343 if (!di) {
3344 fclose(fp);
3345 return REDIS_ERR;
3346 }
3347
3348 /* Write the SELECT DB opcode */
3349 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3350 if (rdbSaveLen(fp,j) == -1) goto werr;
3351
3352 /* Iterate this DB writing every entry */
3353 while((de = dictNext(di)) != NULL) {
3354 robj *key = dictGetEntryKey(de);
3355 robj *o = dictGetEntryVal(de);
3356 time_t expiretime = getExpire(db,key);
3357
3358 /* Save the expire time */
3359 if (expiretime != -1) {
3360 /* If this key is already expired skip it */
3361 if (expiretime < now) continue;
3362 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3363 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3364 }
3365 /* Save the key and associated value. This requires special
3366 * handling if the value is swapped out. */
3367 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3368 key->storage == REDIS_VM_SWAPPING) {
3369 /* Save type, key, value */
3370 if (rdbSaveType(fp,o->type) == -1) goto werr;
3371 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3372 if (rdbSaveObject(fp,o) == -1) goto werr;
3373 } else {
3374 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3375 robj *po;
3376 /* Get a preview of the object in memory */
3377 po = vmPreviewObject(key);
3378 /* Save type, key, value */
3379 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3380 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3381 if (rdbSaveObject(fp,po) == -1) goto werr;
3382 /* Remove the loaded object from memory */
3383 decrRefCount(po);
3384 }
3385 }
3386 dictReleaseIterator(di);
3387 }
3388 /* EOF opcode */
3389 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3390
3391 /* Make sure data will not remain on the OS's output buffers */
3392 fflush(fp);
3393 fsync(fileno(fp));
3394 fclose(fp);
3395
3396 /* Use RENAME to make sure the DB file is changed atomically only
3397 * if the generate DB file is ok. */
3398 if (rename(tmpfile,filename) == -1) {
3399 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
3400 unlink(tmpfile);
3401 return REDIS_ERR;
3402 }
3403 redisLog(REDIS_NOTICE,"DB saved on disk");
3404 server.dirty = 0;
3405 server.lastsave = time(NULL);
3406 return REDIS_OK;
3407
3408 werr:
3409 fclose(fp);
3410 unlink(tmpfile);
3411 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3412 if (di) dictReleaseIterator(di);
3413 return REDIS_ERR;
3414 }
3415
3416 static int rdbSaveBackground(char *filename) {
3417 pid_t childpid;
3418
3419 if (server.bgsavechildpid != -1) return REDIS_ERR;
3420 if (server.vm_enabled) waitEmptyIOJobsQueue();
3421 if ((childpid = fork()) == 0) {
3422 /* Child */
3423 if (server.vm_enabled) vmReopenSwapFile();
3424 close(server.fd);
3425 if (rdbSave(filename) == REDIS_OK) {
3426 _exit(0);
3427 } else {
3428 _exit(1);
3429 }
3430 } else {
3431 /* Parent */
3432 if (childpid == -1) {
3433 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3434 strerror(errno));
3435 return REDIS_ERR;
3436 }
3437 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3438 server.bgsavechildpid = childpid;
3439 return REDIS_OK;
3440 }
3441 return REDIS_OK; /* unreached */
3442 }
3443
/* Remove the temp file "temp-<childpid>.rdb" from the current directory.
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3450
/* Read a one byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3456
/* Read a time stored as a 4 byte int32_t (host byte order) from 'fp'.
 * Returns the time, or -1 if the value could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,4,1,fp) != 1) return -1;
    return (time_t) stored;
}
3462
3463 /* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3464 * of this file for a description of how this are stored on disk.
3465 *
3466 * isencoded is set to 1 if the readed length is not actually a length but
3467 * an "encoding type", check the above comments for more info */
3468 static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3469 unsigned char buf[2];
3470 uint32_t len;
3471 int type;
3472
3473 if (isencoded) *isencoded = 0;
3474 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3475 type = (buf[0]&0xC0)>>6;
3476 if (type == REDIS_RDB_6BITLEN) {
3477 /* Read a 6 bit len */
3478 return buf[0]&0x3F;
3479 } else if (type == REDIS_RDB_ENCVAL) {
3480 /* Read a 6 bit len encoding type */
3481 if (isencoded) *isencoded = 1;
3482 return buf[0]&0x3F;
3483 } else if (type == REDIS_RDB_14BITLEN) {
3484 /* Read a 14 bit len */
3485 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3486 return ((buf[0]&0x3F)<<8)|buf[1];
3487 } else {
3488 /* Read a 32 bit len */
3489 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3490 return ntohl(len);
3491 }
3492 }
3493
3494 static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3495 unsigned char enc[4];
3496 long long val;
3497
3498 if (enctype == REDIS_RDB_ENC_INT8) {
3499 if (fread(enc,1,1,fp) == 0) return NULL;
3500 val = (signed char)enc[0];
3501 } else if (enctype == REDIS_RDB_ENC_INT16) {
3502 uint16_t v;
3503 if (fread(enc,2,1,fp) == 0) return NULL;
3504 v = enc[0]|(enc[1]<<8);
3505 val = (int16_t)v;
3506 } else if (enctype == REDIS_RDB_ENC_INT32) {
3507 uint32_t v;
3508 if (fread(enc,4,1,fp) == 0) return NULL;
3509 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3510 val = (int32_t)v;
3511 } else {
3512 val = 0; /* anti-warning */
3513 redisAssert(0);
3514 }
3515 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3516 }
3517
3518 static robj *rdbLoadLzfStringObject(FILE*fp) {
3519 unsigned int len, clen;
3520 unsigned char *c = NULL;
3521 sds val = NULL;
3522
3523 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3524 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3525 if ((c = zmalloc(clen)) == NULL) goto err;
3526 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3527 if (fread(c,clen,1,fp) == 0) goto err;
3528 if (lzf_decompress(c,clen,val,len) == 0) goto err;
3529 zfree(c);
3530 return createObject(REDIS_STRING,val);
3531 err:
3532 zfree(c);
3533 sdsfree(val);
3534 return NULL;
3535 }
3536
3537 static robj *rdbLoadStringObject(FILE*fp) {
3538 int isencoded;
3539 uint32_t len;
3540 sds val;
3541
3542 len = rdbLoadLen(fp,&isencoded);
3543 if (isencoded) {
3544 switch(len) {
3545 case REDIS_RDB_ENC_INT8:
3546 case REDIS_RDB_ENC_INT16:
3547 case REDIS_RDB_ENC_INT32:
3548 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
3549 case REDIS_RDB_ENC_LZF:
3550 return tryObjectSharing(rdbLoadLzfStringObject(fp));
3551 default:
3552 redisAssert(0);
3553 }
3554 }
3555
3556 if (len == REDIS_RDB_LENERR) return NULL;
3557 val = sdsnewlen(NULL,len);
3558 if (len && fread(val,len,1,fp) == 0) {
3559 sdsfree(val);
3560 return NULL;
3561 }
3562 return tryObjectSharing(createObject(REDIS_STRING,val));
3563 }
3564
3565 /* For information about double serialization check rdbSaveDoubleValue() */
3566 static int rdbLoadDoubleValue(FILE *fp, double *val) {
3567 char buf[128];
3568 unsigned char len;
3569
3570 if (fread(&len,1,1,fp) == 0) return -1;
3571 switch(len) {
3572 case 255: *val = R_NegInf; return 0;
3573 case 254: *val = R_PosInf; return 0;
3574 case 253: *val = R_Nan; return 0;
3575 default:
3576 if (fread(buf,len,1,fp) == 0) return -1;
3577 buf[len] = '\0';
3578 sscanf(buf, "%lg", val);
3579 return 0;
3580 }
3581 }
3582
3583 /* Load a Redis object of the specified type from the specified file.
3584 * On success a newly allocated object is returned, otherwise NULL. */
3585 static robj *rdbLoadObject(int type, FILE *fp) {
3586 robj *o;
3587
3588 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3589 if (type == REDIS_STRING) {
3590 /* Read string value */
3591 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3592 tryObjectEncoding(o);
3593 } else if (type == REDIS_LIST || type == REDIS_SET) {
3594 /* Read list/set value */
3595 uint32_t listlen;
3596
3597 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3598 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3599 /* It's faster to expand the dict to the right size asap in order
3600 * to avoid rehashing */
3601 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3602 dictExpand(o->ptr,listlen);
3603 /* Load every single element of the list/set */
3604 while(listlen--) {
3605 robj *ele;
3606
3607 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3608 tryObjectEncoding(ele);
3609 if (type == REDIS_LIST) {
3610 listAddNodeTail((list*)o->ptr,ele);
3611 } else {
3612 dictAdd((dict*)o->ptr,ele,NULL);
3613 }
3614 }
3615 } else if (type == REDIS_ZSET) {
3616 /* Read list/set value */
3617 size_t zsetlen;
3618 zset *zs;
3619
3620 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3621 o = createZsetObject();
3622 zs = o->ptr;
3623 /* Load every single element of the list/set */
3624 while(zsetlen--) {
3625 robj *ele;
3626 double *score = zmalloc(sizeof(double));
3627
3628 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3629 tryObjectEncoding(ele);
3630 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3631 dictAdd(zs->dict,ele,score);
3632 zslInsert(zs->zsl,*score,ele);
3633 incrRefCount(ele); /* added to skiplist */
3634 }
3635 } else if (type == REDIS_HASH) {
3636 size_t hashlen;
3637
3638 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3639 o = createHashObject();
3640 /* Too many entries? Use an hash table. */
3641 if (hashlen > server.hash_max_zipmap_entries)
3642 convertToRealHash(o);
3643 /* Load every key/value, then set it into the zipmap or hash
3644 * table, as needed. */
3645 while(hashlen--) {
3646 robj *key, *val;
3647
3648 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3649 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3650 /* If we are using a zipmap and there are too big values
3651 * the object is converted to real hash table encoding. */
3652 if (o->encoding != REDIS_ENCODING_HT &&
3653 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3654 sdslen(val->ptr) > server.hash_max_zipmap_value))
3655 {
3656 convertToRealHash(o);
3657 }
3658
3659 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3660 unsigned char *zm = o->ptr;
3661
3662 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3663 val->ptr,sdslen(val->ptr),NULL);
3664 o->ptr = zm;
3665 decrRefCount(key);
3666 decrRefCount(val);
3667 } else {
3668 tryObjectEncoding(key);
3669 tryObjectEncoding(val);
3670 dictAdd((dict*)o->ptr,key,val);
3671 }
3672 }
3673 } else {
3674 redisAssert(0);
3675 }
3676 return o;
3677 }
3678
3679 static int rdbLoad(char *filename) {
3680 FILE *fp;
3681 robj *keyobj = NULL;
3682 uint32_t dbid;
3683 int type, retval, rdbver;
3684 dict *d = server.db[0].dict;
3685 redisDb *db = server.db+0;
3686 char buf[1024];
3687 time_t expiretime = -1, now = time(NULL);
3688 long long loadedkeys = 0;
3689
3690 fp = fopen(filename,"r");
3691 if (!fp) return REDIS_ERR;
3692 if (fread(buf,9,1,fp) == 0) goto eoferr;
3693 buf[9] = '\0';
3694 if (memcmp(buf,"REDIS",5) != 0) {
3695 fclose(fp);
3696 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3697 return REDIS_ERR;
3698 }
3699 rdbver = atoi(buf+5);
3700 if (rdbver != 1) {
3701 fclose(fp);
3702 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3703 return REDIS_ERR;
3704 }
3705 while(1) {
3706 robj *o;
3707
3708 /* Read type. */
3709 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3710 if (type == REDIS_EXPIRETIME) {
3711 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3712 /* We read the time so we need to read the object type again */
3713 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3714 }
3715 if (type == REDIS_EOF) break;
3716 /* Handle SELECT DB opcode as a special case */
3717 if (type == REDIS_SELECTDB) {
3718 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3719 goto eoferr;
3720 if (dbid >= (unsigned)server.dbnum) {
3721 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
3722 exit(1);
3723 }
3724 db = server.db+dbid;
3725 d = db->dict;
3726 continue;
3727 }
3728 /* Read key */
3729 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3730 /* Read value */
3731 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
3732 /* Add the new object in the hash table */
3733 retval = dictAdd(d,keyobj,o);
3734 if (retval == DICT_ERR) {
3735 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
3736 exit(1);
3737 }
3738 /* Set the expire time if needed */
3739 if (expiretime != -1) {
3740 setExpire(db,keyobj,expiretime);
3741 /* Delete this key if already expired */
3742 if (expiretime < now) deleteKey(db,keyobj);
3743 expiretime = -1;
3744 }
3745 keyobj = o = NULL;
3746 /* Handle swapping while loading big datasets when VM is on */
3747 loadedkeys++;
3748 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3749 while (zmalloc_used_memory() > server.vm_max_memory) {
3750 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3751 }
3752 }
3753 }
3754 fclose(fp);
3755 return REDIS_OK;
3756
3757 eoferr: /* unexpected end of file is handled here with a fatal exit */
3758 if (keyobj) decrRefCount(keyobj);
3759 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
3760 exit(1);
3761 return REDIS_ERR; /* Just to avoid warning */
3762 }
3763
3764 /*================================== Commands =============================== */
3765
3766 static void authCommand(redisClient *c) {
3767 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
3768 c->authenticated = 1;
3769 addReply(c,shared.ok);
3770 } else {
3771 c->authenticated = 0;
3772 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
3773 }
3774 }
3775
3776 static void pingCommand(redisClient *c) {
3777 addReply(c,shared.pong);
3778 }
3779
3780 static void echoCommand(redisClient *c) {
3781 addReplyBulk(c,c->argv[1]);
3782 }
3783
3784 /*=================================== Strings =============================== */
3785
3786 static void setGenericCommand(redisClient *c, int nx) {
3787 int retval;
3788
3789 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3790 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3791 if (retval == DICT_ERR) {
3792 if (!nx) {
3793 /* If the key is about a swapped value, we want a new key object
3794 * to overwrite the old. So we delete the old key in the database.
3795 * This will also make sure that swap pages about the old object
3796 * will be marked as free. */
3797 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
3798 incrRefCount(c->argv[1]);
3799 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3800 incrRefCount(c->argv[2]);
3801 } else {
3802 addReply(c,shared.czero);
3803 return;
3804 }
3805 } else {
3806 incrRefCount(c->argv[1]);
3807 incrRefCount(c->argv[2]);
3808 }
3809 server.dirty++;
3810 removeExpire(c->db,c->argv[1]);
3811 addReply(c, nx ? shared.cone : shared.ok);
3812 }
3813
3814 static void setCommand(redisClient *c) {
3815 setGenericCommand(c,0);
3816 }
3817
3818 static void setnxCommand(redisClient *c) {
3819 setGenericCommand(c,1);
3820 }
3821
3822 static int getGenericCommand(redisClient *c) {
3823 robj *o;
3824
3825 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
3826 return REDIS_OK;
3827
3828 if (o->type != REDIS_STRING) {
3829 addReply(c,shared.wrongtypeerr);
3830 return REDIS_ERR;
3831 } else {
3832 addReplyBulk(c,o);
3833 return REDIS_OK;
3834 }
3835 }
3836
3837 static void getCommand(redisClient *c) {
3838 getGenericCommand(c);
3839 }
3840
3841 static void getsetCommand(redisClient *c) {
3842 if (getGenericCommand(c) == REDIS_ERR) return;
3843 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3844 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3845 } else {
3846 incrRefCount(c->argv[1]);
3847 }
3848 incrRefCount(c->argv[2]);
3849 server.dirty++;
3850 removeExpire(c->db,c->argv[1]);
3851 }
3852
3853 static void mgetCommand(redisClient *c) {
3854 int j;
3855
3856 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
3857 for (j = 1; j < c->argc; j++) {
3858 robj *o = lookupKeyRead(c->db,c->argv[j]);
3859 if (o == NULL) {
3860 addReply(c,shared.nullbulk);
3861 } else {
3862 if (o->type != REDIS_STRING) {
3863 addReply(c,shared.nullbulk);
3864 } else {
3865 addReplyBulk(c,o);
3866 }
3867 }
3868 }
3869 }
3870
3871 static void msetGenericCommand(redisClient *c, int nx) {
3872 int j, busykeys = 0;
3873
3874 if ((c->argc % 2) == 0) {
3875 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
3876 return;
3877 }
3878 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3879 * set nothing at all if at least one already key exists. */
3880 if (nx) {
3881 for (j = 1; j < c->argc; j += 2) {
3882 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3883 busykeys++;
3884 }
3885 }
3886 }
3887 if (busykeys) {
3888 addReply(c, shared.czero);
3889 return;
3890 }
3891
3892 for (j = 1; j < c->argc; j += 2) {
3893 int retval;
3894
3895 tryObjectEncoding(c->argv[j+1]);
3896 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3897 if (retval == DICT_ERR) {
3898 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3899 incrRefCount(c->argv[j+1]);
3900 } else {
3901 incrRefCount(c->argv[j]);
3902 incrRefCount(c->argv[j+1]);
3903 }
3904 removeExpire(c->db,c->argv[j]);
3905 }
3906 server.dirty += (c->argc-1)/2;
3907 addReply(c, nx ? shared.cone : shared.ok);
3908 }
3909
3910 static void msetCommand(redisClient *c) {
3911 msetGenericCommand(c,0);
3912 }
3913
3914 static void msetnxCommand(redisClient *c) {
3915 msetGenericCommand(c,1);
3916 }
3917
3918 static void incrDecrCommand(redisClient *c, long long incr) {
3919 long long value;
3920 int retval;
3921 robj *o;
3922
3923 o = lookupKeyWrite(c->db,c->argv[1]);
3924 if (o == NULL) {
3925 value = 0;
3926 } else {
3927 if (o->type != REDIS_STRING) {
3928 value = 0;
3929 } else {
3930 char *eptr;
3931
3932 if (o->encoding == REDIS_ENCODING_RAW)
3933 value = strtoll(o->ptr, &eptr, 10);
3934 else if (o->encoding == REDIS_ENCODING_INT)
3935 value = (long)o->ptr;
3936 else
3937 redisAssert(1 != 1);
3938 }
3939 }
3940
3941 value += incr;
3942 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
3943 tryObjectEncoding(o);
3944 retval = dictAdd(c->db->dict,c->argv[1],o);
3945 if (retval == DICT_ERR) {
3946 dictReplace(c->db->dict,c->argv[1],o);
3947 removeExpire(c->db,c->argv[1]);
3948 } else {
3949 incrRefCount(c->argv[1]);
3950 }
3951 server.dirty++;
3952 addReply(c,shared.colon);
3953 addReply(c,o);
3954 addReply(c,shared.crlf);
3955 }
3956
3957 static void incrCommand(redisClient *c) {
3958 incrDecrCommand(c,1);
3959 }
3960
3961 static void decrCommand(redisClient *c) {
3962 incrDecrCommand(c,-1);
3963 }
3964
3965 static void incrbyCommand(redisClient *c) {
3966 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3967 incrDecrCommand(c,incr);
3968 }
3969
3970 static void decrbyCommand(redisClient *c) {
3971 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
3972 incrDecrCommand(c,-incr);
3973 }
3974
3975 static void appendCommand(redisClient *c) {
3976 int retval;
3977 size_t totlen;
3978 robj *o;
3979
3980 o = lookupKeyWrite(c->db,c->argv[1]);
3981 if (o == NULL) {
3982 /* Create the key */
3983 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3984 incrRefCount(c->argv[1]);
3985 incrRefCount(c->argv[2]);
3986 totlen = stringObjectLen(c->argv[2]);
3987 } else {
3988 dictEntry *de;
3989
3990 de = dictFind(c->db->dict,c->argv[1]);
3991 assert(de != NULL);
3992
3993 o = dictGetEntryVal(de);
3994 if (o->type != REDIS_STRING) {
3995 addReply(c,shared.wrongtypeerr);
3996 return;
3997 }
3998 /* If the object is specially encoded or shared we have to make
3999 * a copy */
4000 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4001 robj *decoded = getDecodedObject(o);
4002
4003 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4004 decrRefCount(decoded);
4005 dictReplace(c->db->dict,c->argv[1],o);
4006 }
4007 /* APPEND! */
4008 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4009 o->ptr = sdscatlen(o->ptr,
4010 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4011 } else {
4012 o->ptr = sdscatprintf(o->ptr, "%ld",
4013 (unsigned long) c->argv[2]->ptr);
4014 }
4015 totlen = sdslen(o->ptr);
4016 }
4017 server.dirty++;
4018 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4019 }
4020
4021 static void substrCommand(redisClient *c) {
4022 robj *o;
4023 long start = atoi(c->argv[2]->ptr);
4024 long end = atoi(c->argv[3]->ptr);
4025 size_t rangelen, strlen;
4026 sds range;
4027
4028 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4029 checkType(c,o,REDIS_STRING)) return;
4030
4031 o = getDecodedObject(o);
4032 strlen = sdslen(o->ptr);
4033
4034 /* convert negative indexes */
4035 if (start < 0) start = strlen+start;
4036 if (end < 0) end = strlen+end;
4037 if (start < 0) start = 0;
4038 if (end < 0) end = 0;
4039
4040 /* indexes sanity checks */
4041 if (start > end || (size_t)start >= strlen) {
4042 /* Out of range start or start > end result in null reply */
4043 addReply(c,shared.nullbulk);
4044 decrRefCount(o);
4045 return;
4046 }
4047 if ((size_t)end >= strlen) end = strlen-1;
4048 rangelen = (end-start)+1;
4049
4050 /* Return the result */
4051 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4052 range = sdsnewlen((char*)o->ptr+start,rangelen);
4053 addReplySds(c,range);
4054 addReply(c,shared.crlf);
4055 decrRefCount(o);
4056 }
4057
4058 /* ========================= Type agnostic commands ========================= */
4059
4060 static void delCommand(redisClient *c) {
4061 int deleted = 0, j;
4062
4063 for (j = 1; j < c->argc; j++) {
4064 if (deleteKey(c->db,c->argv[j])) {
4065 server.dirty++;
4066 deleted++;
4067 }
4068 }
4069 addReplyLong(c,deleted);
4070 }
4071
4072 static void existsCommand(redisClient *c) {
4073 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
4074 }
4075
4076 static void selectCommand(redisClient *c) {
4077 int id = atoi(c->argv[1]->ptr);
4078
4079 if (selectDb(c,id) == REDIS_ERR) {
4080 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
4081 } else {
4082 addReply(c,shared.ok);
4083 }
4084 }
4085
4086 static void randomkeyCommand(redisClient *c) {
4087 dictEntry *de;
4088
4089 while(1) {
4090 de = dictGetRandomKey(c->db->dict);
4091 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
4092 }
4093 if (de == NULL) {
4094 addReply(c,shared.plus);
4095 addReply(c,shared.crlf);
4096 } else {
4097 addReply(c,shared.plus);
4098 addReply(c,dictGetEntryKey(de));
4099 addReply(c,shared.crlf);
4100 }
4101 }
4102
4103 static void keysCommand(redisClient *c) {
4104 dictIterator *di;
4105 dictEntry *de;
4106 sds pattern = c->argv[1]->ptr;
4107 int plen = sdslen(pattern);
4108 unsigned long numkeys = 0;
4109 robj *lenobj = createObject(REDIS_STRING,NULL);
4110
4111 di = dictGetIterator(c->db->dict);
4112 addReply(c,lenobj);
4113 decrRefCount(lenobj);
4114 while((de = dictNext(di)) != NULL) {
4115 robj *keyobj = dictGetEntryKey(de);
4116
4117 sds key = keyobj->ptr;
4118 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4119 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
4120 if (expireIfNeeded(c->db,keyobj) == 0) {
4121 addReplyBulk(c,keyobj);
4122 numkeys++;
4123 }
4124 }
4125 }
4126 dictReleaseIterator(di);
4127 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
4128 }
4129
4130 static void dbsizeCommand(redisClient *c) {
4131 addReplySds(c,
4132 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
4133 }
4134
4135 static void lastsaveCommand(redisClient *c) {
4136 addReplySds(c,
4137 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
4138 }
4139
4140 static void typeCommand(redisClient *c) {
4141 robj *o;
4142 char *type;
4143
4144 o = lookupKeyRead(c->db,c->argv[1]);
4145 if (o == NULL) {
4146 type = "+none";
4147 } else {
4148 switch(o->type) {
4149 case REDIS_STRING: type = "+string"; break;
4150 case REDIS_LIST: type = "+list"; break;
4151 case REDIS_SET: type = "+set"; break;
4152 case REDIS_ZSET: type = "+zset"; break;
4153 case REDIS_HASH: type = "+hash"; break;
4154 default: type = "+unknown"; break;
4155 }
4156 }
4157 addReplySds(c,sdsnew(type));
4158 addReply(c,shared.crlf);
4159 }
4160
4161 static void saveCommand(redisClient *c) {
4162 if (server.bgsavechildpid != -1) {
4163 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4164 return;
4165 }
4166 if (rdbSave(server.dbfilename) == REDIS_OK) {
4167 addReply(c,shared.ok);
4168 } else {
4169 addReply(c,shared.err);
4170 }
4171 }
4172
4173 static void bgsaveCommand(redisClient *c) {
4174 if (server.bgsavechildpid != -1) {
4175 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4176 return;
4177 }
4178 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
4179 char *status = "+Background saving started\r\n";
4180 addReplySds(c,sdsnew(status));
4181 } else {
4182 addReply(c,shared.err);
4183 }
4184 }
4185
4186 static void shutdownCommand(redisClient *c) {
4187 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4188 /* Kill the saving child if there is a background saving in progress.
4189 We want to avoid race conditions, for instance our saving child may
4190 overwrite the synchronous saving did by SHUTDOWN. */
4191 if (server.bgsavechildpid != -1) {
4192 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4193 kill(server.bgsavechildpid,SIGKILL);
4194 rdbRemoveTempFile(server.bgsavechildpid);
4195 }
4196 if (server.appendonly) {
4197 /* Append only file: fsync() the AOF and exit */
4198 fsync(server.appendfd);
4199 if (server.vm_enabled) unlink(server.vm_swap_file);
4200 exit(0);
4201 } else {
4202 /* Snapshotting. Perform a SYNC SAVE and exit */
4203 if (rdbSave(server.dbfilename) == REDIS_OK) {
4204 if (server.daemonize)
4205 unlink(server.pidfile);
4206 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4207 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4208 if (server.vm_enabled) unlink(server.vm_swap_file);
4209 exit(0);
4210 } else {
4211 /* Ooops.. error saving! The best we can do is to continue
4212 * operating. Note that if there was a background saving process,
4213 * in the next cron() Redis will be notified that the background
4214 * saving aborted, handling special stuff like slaves pending for
4215 * synchronization... */
4216 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
4217 addReplySds(c,
4218 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
4219 }
4220 }
4221 }
4222
4223 static void renameGenericCommand(redisClient *c, int nx) {
4224 robj *o;
4225
4226 /* To use the same key as src and dst is probably an error */
4227 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
4228 addReply(c,shared.sameobjecterr);
4229 return;
4230 }
4231
4232 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
4233 return;
4234
4235 incrRefCount(o);
4236 deleteIfVolatile(c->db,c->argv[2]);
4237 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
4238 if (nx) {
4239 decrRefCount(o);
4240 addReply(c,shared.czero);
4241 return;
4242 }
4243 dictReplace(c->db->dict,c->argv[2],o);
4244 } else {
4245 incrRefCount(c->argv[2]);
4246 }
4247 deleteKey(c->db,c->argv[1]);
4248 server.dirty++;
4249 addReply(c,nx ? shared.cone : shared.ok);
4250 }
4251
4252 static void renameCommand(redisClient *c) {
4253 renameGenericCommand(c,0);
4254 }
4255
4256 static void renamenxCommand(redisClient *c) {
4257 renameGenericCommand(c,1);
4258 }
4259
4260 static void moveCommand(redisClient *c) {
4261 robj *o;
4262 redisDb *src, *dst;
4263 int srcid;
4264
4265 /* Obtain source and target DB pointers */
4266 src = c->db;
4267 srcid = c->db->id;
4268 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
4269 addReply(c,shared.outofrangeerr);
4270 return;
4271 }
4272 dst = c->db;
4273 selectDb(c,srcid); /* Back to the source DB */
4274
4275 /* If the user is moving using as target the same
4276 * DB as the source DB it is probably an error. */
4277 if (src == dst) {
4278 addReply(c,shared.sameobjecterr);
4279 return;
4280 }
4281
4282 /* Check if the element exists and get a reference */
4283 o = lookupKeyWrite(c->db,c->argv[1]);
4284 if (!o) {
4285 addReply(c,shared.czero);
4286 return;
4287 }
4288
4289 /* Try to add the element to the target DB */
4290 deleteIfVolatile(dst,c->argv[1]);
4291 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
4292 addReply(c,shared.czero);
4293 return;
4294 }
4295 incrRefCount(c->argv[1]);
4296 incrRefCount(o);
4297
4298 /* OK! key moved, free the entry in the source DB */
4299 deleteKey(src,c->argv[1]);
4300 server.dirty++;
4301 addReply(c,shared.cone);
4302 }
4303
4304 /* =================================== Lists ================================ */
4305 static void pushGenericCommand(redisClient *c, int where) {
4306 robj *lobj;
4307 list *list;
4308
4309 lobj = lookupKeyWrite(c->db,c->argv[1]);
4310 if (lobj == NULL) {
4311 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4312 addReply(c,shared.cone);
4313 return;
4314 }
4315 lobj = createListObject();
4316 list = lobj->ptr;
4317 if (where == REDIS_HEAD) {
4318 listAddNodeHead(list,c->argv[2]);
4319 } else {
4320 listAddNodeTail(list,c->argv[2]);
4321 }
4322 dictAdd(c->db->dict,c->argv[1],lobj);
4323 incrRefCount(c->argv[1]);
4324 incrRefCount(c->argv[2]);
4325 } else {
4326 if (lobj->type != REDIS_LIST) {
4327 addReply(c,shared.wrongtypeerr);
4328 return;
4329 }
4330 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
4331 addReply(c,shared.cone);
4332 return;
4333 }
4334 list = lobj->ptr;
4335 if (where == REDIS_HEAD) {
4336 listAddNodeHead(list,c->argv[2]);
4337 } else {
4338 listAddNodeTail(list,c->argv[2]);
4339 }
4340 incrRefCount(c->argv[2]);
4341 }
4342 server.dirty++;
4343 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
4344 }
4345
4346 static void lpushCommand(redisClient *c) {
4347 pushGenericCommand(c,REDIS_HEAD);
4348 }
4349
4350 static void rpushCommand(redisClient *c) {
4351 pushGenericCommand(c,REDIS_TAIL);
4352 }
4353
4354 static void llenCommand(redisClient *c) {
4355 robj *o;
4356 list *l;
4357
4358 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4359 checkType(c,o,REDIS_LIST)) return;
4360
4361 l = o->ptr;
4362 addReplyUlong(c,listLength(l));
4363 }
4364
4365 static void lindexCommand(redisClient *c) {
4366 robj *o;
4367 int index = atoi(c->argv[2]->ptr);
4368 list *list;
4369 listNode *ln;
4370
4371 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4372 checkType(c,o,REDIS_LIST)) return;
4373 list = o->ptr;
4374
4375 ln = listIndex(list, index);
4376 if (ln == NULL) {
4377 addReply(c,shared.nullbulk);
4378 } else {
4379 robj *ele = listNodeValue(ln);
4380 addReplyBulk(c,ele);
4381 }
4382 }
4383
4384 static void lsetCommand(redisClient *c) {
4385 robj *o;
4386 int index = atoi(c->argv[2]->ptr);
4387 list *list;
4388 listNode *ln;
4389
4390 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4391 checkType(c,o,REDIS_LIST)) return;
4392 list = o->ptr;
4393
4394 ln = listIndex(list, index);
4395 if (ln == NULL) {
4396 addReply(c,shared.outofrangeerr);
4397 } else {
4398 robj *ele = listNodeValue(ln);
4399
4400 decrRefCount(ele);
4401 listNodeValue(ln) = c->argv[3];
4402 incrRefCount(c->argv[3]);
4403 addReply(c,shared.ok);
4404 server.dirty++;
4405 }
4406 }
4407
4408 static void popGenericCommand(redisClient *c, int where) {
4409 robj *o;
4410 list *list;
4411 listNode *ln;
4412
4413 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4414 checkType(c,o,REDIS_LIST)) return;
4415 list = o->ptr;
4416
4417 if (where == REDIS_HEAD)
4418 ln = listFirst(list);
4419 else
4420 ln = listLast(list);
4421
4422 if (ln == NULL) {
4423 addReply(c,shared.nullbulk);
4424 } else {
4425 robj *ele = listNodeValue(ln);
4426 addReplyBulk(c,ele);
4427 listDelNode(list,ln);
4428 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4429 server.dirty++;
4430 }
4431 }
4432
4433 static void lpopCommand(redisClient *c) {
4434 popGenericCommand(c,REDIS_HEAD);
4435 }
4436
4437 static void rpopCommand(redisClient *c) {
4438 popGenericCommand(c,REDIS_TAIL);
4439 }
4440
4441 static void lrangeCommand(redisClient *c) {
4442 robj *o;
4443 int start = atoi(c->argv[2]->ptr);
4444 int end = atoi(c->argv[3]->ptr);
4445 int llen;
4446 int rangelen, j;
4447 list *list;
4448 listNode *ln;
4449 robj *ele;
4450
4451 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4452 checkType(c,o,REDIS_LIST)) return;
4453 list = o->ptr;
4454 llen = listLength(list);
4455
4456 /* convert negative indexes */
4457 if (start < 0) start = llen+start;
4458 if (end < 0) end = llen+end;
4459 if (start < 0) start = 0;
4460 if (end < 0) end = 0;
4461
4462 /* indexes sanity checks */
4463 if (start > end || start >= llen) {
4464 /* Out of range start or start > end result in empty list */
4465 addReply(c,shared.emptymultibulk);
4466 return;
4467 }
4468 if (end >= llen) end = llen-1;
4469 rangelen = (end-start)+1;
4470
4471 /* Return the result in form of a multi-bulk reply */
4472 ln = listIndex(list, start);
4473 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4474 for (j = 0; j < rangelen; j++) {
4475 ele = listNodeValue(ln);
4476 addReplyBulk(c,ele);
4477 ln = ln->next;
4478 }
4479 }
4480
4481 static void ltrimCommand(redisClient *c) {
4482 robj *o;
4483 int start = atoi(c->argv[2]->ptr);
4484 int end = atoi(c->argv[3]->ptr);
4485 int llen;
4486 int j, ltrim, rtrim;
4487 list *list;
4488 listNode *ln;
4489
4490 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4491 checkType(c,o,REDIS_LIST)) return;
4492 list = o->ptr;
4493 llen = listLength(list);
4494
4495 /* convert negative indexes */
4496 if (start < 0) start = llen+start;
4497 if (end < 0) end = llen+end;
4498 if (start < 0) start = 0;
4499 if (end < 0) end = 0;
4500
4501 /* indexes sanity checks */
4502 if (start > end || start >= llen) {
4503 /* Out of range start or start > end result in empty list */
4504 ltrim = llen;
4505 rtrim = 0;
4506 } else {
4507 if (end >= llen) end = llen-1;
4508 ltrim = start;
4509 rtrim = llen-end-1;
4510 }
4511
4512 /* Remove list elements to perform the trim */
4513 for (j = 0; j < ltrim; j++) {
4514 ln = listFirst(list);
4515 listDelNode(list,ln);
4516 }
4517 for (j = 0; j < rtrim; j++) {
4518 ln = listLast(list);
4519 listDelNode(list,ln);
4520 }
4521 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4522 server.dirty++;
4523 addReply(c,shared.ok);
4524 }
4525
4526 static void lremCommand(redisClient *c) {
4527 robj *o;
4528 list *list;
4529 listNode *ln, *next;
4530 int toremove = atoi(c->argv[2]->ptr);
4531 int removed = 0;
4532 int fromtail = 0;
4533
4534 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4535 checkType(c,o,REDIS_LIST)) return;
4536 list = o->ptr;
4537
4538 if (toremove < 0) {
4539 toremove = -toremove;
4540 fromtail = 1;
4541 }
4542 ln = fromtail ? list->tail : list->head;
4543 while (ln) {
4544 robj *ele = listNodeValue(ln);
4545
4546 next = fromtail ? ln->prev : ln->next;
4547 if (compareStringObjects(ele,c->argv[3]) == 0) {
4548 listDelNode(list,ln);
4549 server.dirty++;
4550 removed++;
4551 if (toremove && removed == toremove) break;
4552 }
4553 ln = next;
4554 }
4555 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
4556 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
4557 }
4558
4559 /* This is the semantic of this command:
4560 * RPOPLPUSH srclist dstlist:
4561 * IF LLEN(srclist) > 0
4562 * element = RPOP srclist
4563 * LPUSH dstlist element
4564 * RETURN element
4565 * ELSE
4566 * RETURN nil
4567 * END
4568 * END
4569 *
4570 * The idea is to be able to get an element from a list in a reliable way
4571 * since the element is not just returned but pushed against another list
4572 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4573 */
4574 static void rpoplpushcommand(redisClient *c) {
4575 robj *sobj;
4576 list *srclist;
4577 listNode *ln;
4578
4579 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4580 checkType(c,sobj,REDIS_LIST)) return;
4581 srclist = sobj->ptr;
4582 ln = listLast(srclist);
4583
4584 if (ln == NULL) {
4585 addReply(c,shared.nullbulk);
4586 } else {
4587 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4588 robj *ele = listNodeValue(ln);
4589 list *dstlist;
4590
4591 if (dobj && dobj->type != REDIS_LIST) {
4592 addReply(c,shared.wrongtypeerr);
4593 return;
4594 }
4595
4596 /* Add the element to the target list (unless it's directly
4597 * passed to some BLPOP-ing client */
4598 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4599 if (dobj == NULL) {
4600 /* Create the list if the key does not exist */
4601 dobj = createListObject();
4602 dictAdd(c->db->dict,c->argv[2],dobj);
4603 incrRefCount(c->argv[2]);
4604 }
4605 dstlist = dobj->ptr;
4606 listAddNodeHead(dstlist,ele);
4607 incrRefCount(ele);
4608 }
4609
4610 /* Send the element to the client as reply as well */
4611 addReplyBulk(c,ele);
4612
4613 /* Finally remove the element from the source list */
4614 listDelNode(srclist,ln);
4615 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
4616 server.dirty++;
4617 }
4618 }
4619
4620 /* ==================================== Sets ================================ */
4621
4622 static void saddCommand(redisClient *c) {
4623 robj *set;
4624
4625 set = lookupKeyWrite(c->db,c->argv[1]);
4626 if (set == NULL) {
4627 set = createSetObject();
4628 dictAdd(c->db->dict,c->argv[1],set);
4629 incrRefCount(c->argv[1]);
4630 } else {
4631 if (set->type != REDIS_SET) {
4632 addReply(c,shared.wrongtypeerr);
4633 return;
4634 }
4635 }
4636 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4637 incrRefCount(c->argv[2]);
4638 server.dirty++;
4639 addReply(c,shared.cone);
4640 } else {
4641 addReply(c,shared.czero);
4642 }
4643 }
4644
4645 static void sremCommand(redisClient *c) {
4646 robj *set;
4647
4648 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4649 checkType(c,set,REDIS_SET)) return;
4650
4651 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4652 server.dirty++;
4653 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4654 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4655 addReply(c,shared.cone);
4656 } else {
4657 addReply(c,shared.czero);
4658 }
4659 }
4660
4661 static void smoveCommand(redisClient *c) {
4662 robj *srcset, *dstset;
4663
4664 srcset = lookupKeyWrite(c->db,c->argv[1]);
4665 dstset = lookupKeyWrite(c->db,c->argv[2]);
4666
4667 /* If the source key does not exist return 0, if it's of the wrong type
4668 * raise an error */
4669 if (srcset == NULL || srcset->type != REDIS_SET) {
4670 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4671 return;
4672 }
4673 /* Error if the destination key is not a set as well */
4674 if (dstset && dstset->type != REDIS_SET) {
4675 addReply(c,shared.wrongtypeerr);
4676 return;
4677 }
4678 /* Remove the element from the source set */
4679 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4680 /* Key not found in the src set! return zero */
4681 addReply(c,shared.czero);
4682 return;
4683 }
4684 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4685 deleteKey(c->db,c->argv[1]);
4686 server.dirty++;
4687 /* Add the element to the destination set */
4688 if (!dstset) {
4689 dstset = createSetObject();
4690 dictAdd(c->db->dict,c->argv[2],dstset);
4691 incrRefCount(c->argv[2]);
4692 }
4693 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4694 incrRefCount(c->argv[3]);
4695 addReply(c,shared.cone);
4696 }
4697
4698 static void sismemberCommand(redisClient *c) {
4699 robj *set;
4700
4701 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4702 checkType(c,set,REDIS_SET)) return;
4703
4704 if (dictFind(set->ptr,c->argv[2]))
4705 addReply(c,shared.cone);
4706 else
4707 addReply(c,shared.czero);
4708 }
4709
4710 static void scardCommand(redisClient *c) {
4711 robj *o;
4712 dict *s;
4713
4714 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4715 checkType(c,o,REDIS_SET)) return;
4716
4717 s = o->ptr;
4718 addReplyUlong(c,dictSize(s));
4719 }
4720
4721 static void spopCommand(redisClient *c) {
4722 robj *set;
4723 dictEntry *de;
4724
4725 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4726 checkType(c,set,REDIS_SET)) return;
4727
4728 de = dictGetRandomKey(set->ptr);
4729 if (de == NULL) {
4730 addReply(c,shared.nullbulk);
4731 } else {
4732 robj *ele = dictGetEntryKey(de);
4733
4734 addReplyBulk(c,ele);
4735 dictDelete(set->ptr,ele);
4736 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4737 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
4738 server.dirty++;
4739 }
4740 }
4741
4742 static void srandmemberCommand(redisClient *c) {
4743 robj *set;
4744 dictEntry *de;
4745
4746 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4747 checkType(c,set,REDIS_SET)) return;
4748
4749 de = dictGetRandomKey(set->ptr);
4750 if (de == NULL) {
4751 addReply(c,shared.nullbulk);
4752 } else {
4753 robj *ele = dictGetEntryKey(de);
4754
4755 addReplyBulk(c,ele);
4756 }
4757 }
4758
4759 static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4760 dict **d1 = (void*) s1, **d2 = (void*) s2;
4761
4762 return dictSize(*d1)-dictSize(*d2);
4763 }
4764
4765 static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
4766 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4767 dictIterator *di;
4768 dictEntry *de;
4769 robj *lenobj = NULL, *dstset = NULL;
4770 unsigned long j, cardinality = 0;
4771
4772 for (j = 0; j < setsnum; j++) {
4773 robj *setobj;
4774
4775 setobj = dstkey ?
4776 lookupKeyWrite(c->db,setskeys[j]) :
4777 lookupKeyRead(c->db,setskeys[j]);
4778 if (!setobj) {
4779 zfree(dv);
4780 if (dstkey) {
4781 if (deleteKey(c->db,dstkey))
4782 server.dirty++;
4783 addReply(c,shared.czero);
4784 } else {
4785 addReply(c,shared.nullmultibulk);
4786 }
4787 return;
4788 }
4789 if (setobj->type != REDIS_SET) {
4790 zfree(dv);
4791 addReply(c,shared.wrongtypeerr);
4792 return;
4793 }
4794 dv[j] = setobj->ptr;
4795 }
4796 /* Sort sets from the smallest to largest, this will improve our
4797 * algorithm's performace */
4798 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4799
4800 /* The first thing we should output is the total number of elements...
4801 * since this is a multi-bulk write, but at this stage we don't know
4802 * the intersection set size, so we use a trick, append an empty object
4803 * to the output list and save the pointer to later modify it with the
4804 * right length */
4805 if (!dstkey) {
4806 lenobj = createObject(REDIS_STRING,NULL);
4807 addReply(c,lenobj);
4808 decrRefCount(lenobj);
4809 } else {
4810 /* If we have a target key where to store the resulting set
4811 * create this key with an empty set inside */
4812 dstset = createSetObject();
4813 }
4814
4815 /* Iterate all the elements of the first (smallest) set, and test
4816 * the element against all the other sets, if at least one set does
4817 * not include the element it is discarded */
4818 di = dictGetIterator(dv[0]);
4819
4820 while((de = dictNext(di)) != NULL) {
4821 robj *ele;
4822
4823 for (j = 1; j < setsnum; j++)
4824 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4825 if (j != setsnum)
4826 continue; /* at least one set does not contain the member */
4827 ele = dictGetEntryKey(de);
4828 if (!dstkey) {
4829 addReplyBulk(c,ele);
4830 cardinality++;
4831 } else {
4832 dictAdd(dstset->ptr,ele,NULL);
4833 incrRefCount(ele);
4834 }
4835 }
4836 dictReleaseIterator(di);
4837
4838 if (dstkey) {
4839 /* Store the resulting set into the target, if the intersection
4840 * is not an empty set. */
4841 deleteKey(c->db,dstkey);
4842 if (dictSize((dict*)dstset->ptr) > 0) {
4843 dictAdd(c->db->dict,dstkey,dstset);
4844 incrRefCount(dstkey);
4845 addReplyLong(c,dictSize((dict*)dstset->ptr));
4846 } else {
4847 decrRefCount(dstset);
4848 addReply(c,shared.czero);
4849 }
4850 server.dirty++;
4851 } else {
4852 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
4853 }
4854 zfree(dv);
4855 }
4856
4857 static void sinterCommand(redisClient *c) {
4858 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4859 }
4860
4861 static void sinterstoreCommand(redisClient *c) {
4862 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4863 }
4864
/* Operation selectors passed as the 'op' argument of the generic set and
 * sorted set commands in this file: sunionDiffGenericCommand() is called
 * with REDIS_OP_UNION or REDIS_OP_DIFF, zunionInterGenericCommand() with
 * REDIS_OP_UNION or REDIS_OP_INTER. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
4868
4869 static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
4870 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4871 dictIterator *di;
4872 dictEntry *de;
4873 robj *dstset = NULL;
4874 int j, cardinality = 0;
4875
4876 for (j = 0; j < setsnum; j++) {
4877 robj *setobj;
4878
4879 setobj = dstkey ?
4880 lookupKeyWrite(c->db,setskeys[j]) :
4881 lookupKeyRead(c->db,setskeys[j]);
4882 if (!setobj) {
4883 dv[j] = NULL;
4884 continue;
4885 }
4886 if (setobj->type != REDIS_SET) {
4887 zfree(dv);
4888 addReply(c,shared.wrongtypeerr);
4889 return;
4890 }
4891 dv[j] = setobj->ptr;
4892 }
4893
4894 /* We need a temp set object to store our union. If the dstkey
4895 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4896 * this set object will be the resulting object to set into the target key*/
4897 dstset = createSetObject();
4898
4899 /* Iterate all the elements of all the sets, add every element a single
4900 * time to the result set */
4901 for (j = 0; j < setsnum; j++) {
4902 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
4903 if (!dv[j]) continue; /* non existing keys are like empty sets */
4904
4905 di = dictGetIterator(dv[j]);
4906
4907 while((de = dictNext(di)) != NULL) {
4908 robj *ele;
4909
4910 /* dictAdd will not add the same element multiple times */
4911 ele = dictGetEntryKey(de);
4912 if (op == REDIS_OP_UNION || j == 0) {
4913 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4914 incrRefCount(ele);
4915 cardinality++;
4916 }
4917 } else if (op == REDIS_OP_DIFF) {
4918 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4919 cardinality--;
4920 }
4921 }
4922 }
4923 dictReleaseIterator(di);
4924
4925 /* result set is empty? Exit asap. */
4926 if (op == REDIS_OP_DIFF && cardinality == 0) break;
4927 }
4928
4929 /* Output the content of the resulting set, if not in STORE mode */
4930 if (!dstkey) {
4931 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4932 di = dictGetIterator(dstset->ptr);
4933 while((de = dictNext(di)) != NULL) {
4934 robj *ele;
4935
4936 ele = dictGetEntryKey(de);
4937 addReplyBulk(c,ele);
4938 }
4939 dictReleaseIterator(di);
4940 decrRefCount(dstset);
4941 } else {
4942 /* If we have a target key where to store the resulting set
4943 * create this key with the result set inside */
4944 deleteKey(c->db,dstkey);
4945 if (dictSize((dict*)dstset->ptr) > 0) {
4946 dictAdd(c->db->dict,dstkey,dstset);
4947 incrRefCount(dstkey);
4948 addReplyLong(c,dictSize((dict*)dstset->ptr));
4949 } else {
4950 decrRefCount(dstset);
4951 addReply(c,shared.czero);
4952 }
4953 server.dirty++;
4954 }
4955 zfree(dv);
4956 }
4957
4958 static void sunionCommand(redisClient *c) {
4959 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
4960 }
4961
4962 static void sunionstoreCommand(redisClient *c) {
4963 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4964 }
4965
4966 static void sdiffCommand(redisClient *c) {
4967 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4968 }
4969
4970 static void sdiffstoreCommand(redisClient *c) {
4971 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
4972 }
4973
4974 /* ==================================== ZSets =============================== */
4975
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
4983
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows traversing the list
 * from tail to head, useful for ZREVRANGE. */
4992
4993 static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4994 zskiplistNode *zn = zmalloc(sizeof(*zn));
4995
4996 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4997 if (level > 0)
4998 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
4999 zn->score = score;
5000 zn->obj = obj;
5001 return zn;
5002 }
5003
5004 static zskiplist *zslCreate(void) {
5005 int j;
5006 zskiplist *zsl;
5007
5008 zsl = zmalloc(sizeof(*zsl));
5009 zsl->level = 1;
5010 zsl->length = 0;
5011 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5012 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5013 zsl->header->forward[j] = NULL;
5014
5015 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5016 if (j < ZSKIPLIST_MAXLEVEL-1)
5017 zsl->header->span[j] = 0;
5018 }
5019 zsl->header->backward = NULL;
5020 zsl->tail = NULL;
5021 return zsl;
5022 }
5023
5024 static void zslFreeNode(zskiplistNode *node) {
5025 decrRefCount(node->obj);
5026 zfree(node->forward);
5027 zfree(node->span);
5028 zfree(node);
5029 }
5030
5031 static void zslFree(zskiplist *zsl) {
5032 zskiplistNode *node = zsl->header->forward[0], *next;
5033
5034 zfree(zsl->header->forward);
5035 zfree(zsl->header->span);
5036 zfree(zsl->header);
5037 while(node) {
5038 next = node->forward[0];
5039 zslFreeNode(node);
5040 node = next;
5041 }
5042 zfree(zsl);
5043 }
5044
5045 static int zslRandomLevel(void) {
5046 int level = 1;
5047 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5048 level += 1;
5049 return level;
5050 }
5051
5052 static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5053 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5054 unsigned int rank[ZSKIPLIST_MAXLEVEL];
5055 int i, level;
5056
5057 x = zsl->header;
5058 for (i = zsl->level-1; i >= 0; i--) {
5059 /* store rank that is crossed to reach the insert position */
5060 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
5061
5062 while (x->forward[i] &&
5063 (x->forward[i]->score < score ||
5064 (x->forward[i]->score == score &&
5065 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
5066 rank[i] += i > 0 ? x->span[i-1] : 1;
5067 x = x->forward[i];
5068 }
5069 update[i] = x;
5070 }
5071 /* we assume the key is not already inside, since we allow duplicated
5072 * scores, and the re-insertion of score and redis object should never
5073 * happpen since the caller of zslInsert() should test in the hash table
5074 * if the element is already inside or not. */
5075 level = zslRandomLevel();
5076 if (level > zsl->level) {
5077 for (i = zsl->level; i < level; i++) {
5078 rank[i] = 0;
5079 update[i] = zsl->header;
5080 update[i]->span[i-1] = zsl->length;
5081 }
5082 zsl->level = level;
5083 }
5084 x = zslCreateNode(level,score,obj);
5085 for (i = 0; i < level; i++) {
5086 x->forward[i] = update[i]->forward[i];
5087 update[i]->forward[i] = x;
5088
5089 /* update span covered by update[i] as x is inserted here */
5090 if (i > 0) {
5091 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5092 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5093 }
5094 }
5095
5096 /* increment span for untouched levels */
5097 for (i = level; i < zsl->level; i++) {
5098 update[i]->span[i-1]++;
5099 }
5100
5101 x->backward = (update[0] == zsl->header) ? NULL : update[0];
5102 if (x->forward[0])
5103 x->forward[0]->backward = x;
5104 else
5105 zsl->tail = x;
5106 zsl->length++;
5107 }
5108
5109 /* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5110 void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5111 int i;
5112 for (i = 0; i < zsl->level; i++) {
5113 if (update[i]->forward[i] == x) {
5114 if (i > 0) {
5115 update[i]->span[i-1] += x->span[i-1] - 1;
5116 }
5117 update[i]->forward[i] = x->forward[i];
5118 } else {
5119 /* invariant: i > 0, because update[0]->forward[0]
5120 * is always equal to x */
5121 update[i]->span[i-1] -= 1;
5122 }
5123 }
5124 if (x->forward[0]) {
5125 x->forward[0]->backward = x->backward;
5126 } else {
5127 zsl->tail = x->backward;
5128 }
5129 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5130 zsl->level--;
5131 zsl->length--;
5132 }
5133
5134 /* Delete an element with matching score/object from the skiplist. */
5135 static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5136 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5137 int i;
5138
5139 x = zsl->header;
5140 for (i = zsl->level-1; i >= 0; i--) {
5141 while (x->forward[i] &&
5142 (x->forward[i]->score < score ||
5143 (x->forward[i]->score == score &&
5144 compareStringObjects(x->forward[i]->obj,obj) < 0)))
5145 x = x->forward[i];
5146 update[i] = x;
5147 }
5148 /* We may have multiple elements with the same score, what we need
5149 * is to find the element with both the right score and object. */
5150 x = x->forward[0];
5151 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5152 zslDeleteNode(zsl, x, update);
5153 zslFreeNode(x);
5154 return 1;
5155 } else {
5156 return 0; /* not found */
5157 }
5158 return 0; /* not found */
5159 }
5160
5161 /* Delete all the elements with score between min and max from the skiplist.
5162 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5163 * Note that this function takes the reference to the hash table view of the
5164 * sorted set, in order to remove the elements from the hash table too. */
5165 static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5166 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5167 unsigned long removed = 0;
5168 int i;
5169
5170 x = zsl->header;
5171 for (i = zsl->level-1; i >= 0; i--) {
5172 while (x->forward[i] && x->forward[i]->score < min)
5173 x = x->forward[i];
5174 update[i] = x;
5175 }
5176 /* We may have multiple elements with the same score, what we need
5177 * is to find the element with both the right score and object. */
5178 x = x->forward[0];
5179 while (x && x->score <= max) {
5180 zskiplistNode *next = x->forward[0];
5181 zslDeleteNode(zsl, x, update);
5182 dictDelete(dict,x->obj);
5183 zslFreeNode(x);
5184 removed++;
5185 x = next;
5186 }
5187 return removed; /* not found */
5188 }
5189
5190 /* Delete all the elements with rank between start and end from the skiplist.
5191 * Start and end are inclusive. Note that start and end need to be 1-based */
5192 static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5193 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5194 unsigned long traversed = 0, removed = 0;
5195 int i;
5196
5197 x = zsl->header;
5198 for (i = zsl->level-1; i >= 0; i--) {
5199 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5200 traversed += i > 0 ? x->span[i-1] : 1;
5201 x = x->forward[i];
5202 }
5203 update[i] = x;
5204 }
5205
5206 traversed++;
5207 x = x->forward[0];
5208 while (x && traversed <= end) {
5209 zskiplistNode *next = x->forward[0];
5210 zslDeleteNode(zsl, x, update);
5211 dictDelete(dict,x->obj);
5212 zslFreeNode(x);
5213 removed++;
5214 traversed++;
5215 x = next;
5216 }
5217 return removed;
5218 }
5219
5220 /* Find the first node having a score equal or greater than the specified one.
5221 * Returns NULL if there is no match. */
5222 static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5223 zskiplistNode *x;
5224 int i;
5225
5226 x = zsl->header;
5227 for (i = zsl->level-1; i >= 0; i--) {
5228 while (x->forward[i] && x->forward[i]->score < score)
5229 x = x->forward[i];
5230 }
5231 /* We may have multiple elements with the same score, what we need
5232 * is to find the element with both the right score and object. */
5233 return x->forward[0];
5234 }
5235
5236 /* Find the rank for an element by both score and key.
5237 * Returns 0 when the element cannot be found, rank otherwise.
5238 * Note that the rank is 1-based due to the span of zsl->header to the
5239 * first element. */
5240 static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5241 zskiplistNode *x;
5242 unsigned long rank = 0;
5243 int i;
5244
5245 x = zsl->header;
5246 for (i = zsl->level-1; i >= 0; i--) {
5247 while (x->forward[i] &&
5248 (x->forward[i]->score < score ||
5249 (x->forward[i]->score == score &&
5250 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
5251 rank += i > 0 ? x->span[i-1] : 1;
5252 x = x->forward[i];
5253 }
5254
5255 /* x might be equal to zsl->header, so test if obj is non-NULL */
5256 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5257 return rank;
5258 }
5259 }
5260 return 0;
5261 }
5262
5263 /* Finds an element by its rank. The rank argument needs to be 1-based. */
5264 zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5265 zskiplistNode *x;
5266 unsigned long traversed = 0;
5267 int i;
5268
5269 x = zsl->header;
5270 for (i = zsl->level-1; i >= 0; i--) {
5271 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5272 {
5273 traversed += i > 0 ? x->span[i-1] : 1;
5274 x = x->forward[i];
5275 }
5276 if (traversed == rank) {
5277 return x;
5278 }
5279 }
5280 return NULL;
5281 }
5282
5283 /* The actual Z-commands implementations */
5284
5285 /* This generic command implements both ZADD and ZINCRBY.
5286 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
5287 * the increment if the operation is a ZINCRBY (doincrement == 1). */
5288 static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5289 robj *zsetobj;
5290 zset *zs;
5291 double *score;
5292
5293 zsetobj = lookupKeyWrite(c->db,key);
5294 if (zsetobj == NULL) {
5295 zsetobj = createZsetObject();
5296 dictAdd(c->db->dict,key,zsetobj);
5297 incrRefCount(key);
5298 } else {
5299 if (zsetobj->type != REDIS_ZSET) {
5300 addReply(c,shared.wrongtypeerr);
5301 return;
5302 }
5303 }
5304 zs = zsetobj->ptr;
5305
5306 /* Ok now since we implement both ZADD and ZINCRBY here the code
5307 * needs to handle the two different conditions. It's all about setting
5308 * '*score', that is, the new score to set, to the right value. */
5309 score = zmalloc(sizeof(double));
5310 if (doincrement) {
5311 dictEntry *de;
5312
5313 /* Read the old score. If the element was not present starts from 0 */
5314 de = dictFind(zs->dict,ele);
5315 if (de) {
5316 double *oldscore = dictGetEntryVal(de);
5317 *score = *oldscore + scoreval;
5318 } else {
5319 *score = scoreval;
5320 }
5321 } else {
5322 *score = scoreval;
5323 }
5324
5325 /* What follows is a simple remove and re-insert operation that is common
5326 * to both ZADD and ZINCRBY... */
5327 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5328 /* case 1: New element */
5329 incrRefCount(ele); /* added to hash */
5330 zslInsert(zs->zsl,*score,ele);
5331 incrRefCount(ele); /* added to skiplist */
5332 server.dirty++;
5333 if (doincrement)
5334 addReplyDouble(c,*score);
5335 else
5336 addReply(c,shared.cone);
5337 } else {
5338 dictEntry *de;
5339 double *oldscore;
5340
5341 /* case 2: Score update operation */
5342 de = dictFind(zs->dict,ele);
5343 redisAssert(de != NULL);
5344 oldscore = dictGetEntryVal(de);
5345 if (*score != *oldscore) {
5346 int deleted;
5347
5348 /* Remove and insert the element in the skip list with new score */
5349 deleted = zslDelete(zs->zsl,*oldscore,ele);
5350 redisAssert(deleted != 0);
5351 zslInsert(zs->zsl,*score,ele);
5352 incrRefCount(ele);
5353 /* Update the score in the hash table */
5354 dictReplace(zs->dict,ele,score);
5355 server.dirty++;
5356 } else {
5357 zfree(score);
5358 }
5359 if (doincrement)
5360 addReplyDouble(c,*score);
5361 else
5362 addReply(c,shared.czero);
5363 }
5364 }
5365
5366 static void zaddCommand(redisClient *c) {
5367 double scoreval;
5368
5369 scoreval = strtod(c->argv[2]->ptr,NULL);
5370 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5371 }
5372
5373 static void zincrbyCommand(redisClient *c) {
5374 double scoreval;
5375
5376 scoreval = strtod(c->argv[2]->ptr,NULL);
5377 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5378 }
5379
5380 static void zremCommand(redisClient *c) {
5381 robj *zsetobj;
5382 zset *zs;
5383 dictEntry *de;
5384 double *oldscore;
5385 int deleted;
5386
5387 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5388 checkType(c,zsetobj,REDIS_ZSET)) return;
5389
5390 zs = zsetobj->ptr;
5391 de = dictFind(zs->dict,c->argv[2]);
5392 if (de == NULL) {
5393 addReply(c,shared.czero);
5394 return;
5395 }
5396 /* Delete from the skiplist */
5397 oldscore = dictGetEntryVal(de);
5398 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5399 redisAssert(deleted != 0);
5400
5401 /* Delete from the hash table */
5402 dictDelete(zs->dict,c->argv[2]);
5403 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5404 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5405 server.dirty++;
5406 addReply(c,shared.cone);
5407 }
5408
5409 static void zremrangebyscoreCommand(redisClient *c) {
5410 double min = strtod(c->argv[2]->ptr,NULL);
5411 double max = strtod(c->argv[3]->ptr,NULL);
5412 long deleted;
5413 robj *zsetobj;
5414 zset *zs;
5415
5416 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5417 checkType(c,zsetobj,REDIS_ZSET)) return;
5418
5419 zs = zsetobj->ptr;
5420 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5421 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5422 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5423 server.dirty += deleted;
5424 addReplyLong(c,deleted);
5425 }
5426
5427 static void zremrangebyrankCommand(redisClient *c) {
5428 int start = atoi(c->argv[2]->ptr);
5429 int end = atoi(c->argv[3]->ptr);
5430 int llen;
5431 long deleted;
5432 robj *zsetobj;
5433 zset *zs;
5434
5435 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5436 checkType(c,zsetobj,REDIS_ZSET)) return;
5437 zs = zsetobj->ptr;
5438 llen = zs->zsl->length;
5439
5440 /* convert negative indexes */
5441 if (start < 0) start = llen+start;
5442 if (end < 0) end = llen+end;
5443 if (start < 0) start = 0;
5444 if (end < 0) end = 0;
5445
5446 /* indexes sanity checks */
5447 if (start > end || start >= llen) {
5448 addReply(c,shared.czero);
5449 return;
5450 }
5451 if (end >= llen) end = llen-1;
5452
5453 /* increment start and end because zsl*Rank functions
5454 * use 1-based rank */
5455 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5456 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5457 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
5458 server.dirty += deleted;
5459 addReplyLong(c, deleted);
5460 }
5461
/* One input of zunionInterGenericCommand(). */
typedef struct {
    dict *dict;     /* hash table of the source sorted set (member -> score),
                     * or NULL when the source key does not exist */
    double weight;  /* multiplier applied to every score read from 'dict' */
} zsetopsrc;
5466
5467 static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5468 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5469 unsigned long size1, size2;
5470 size1 = d1->dict ? dictSize(d1->dict) : 0;
5471 size2 = d2->dict ? dictSize(d2->dict) : 0;
5472 return size1 - size2;
5473 }
5474
/* Aggregation modes understood by zunionInterAggregate(): how the scores of
 * a member found in more than one input sorted set are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5478
5479 inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5480 if (aggregate == REDIS_AGGR_SUM) {
5481 *target = *target + val;
5482 } else if (aggregate == REDIS_AGGR_MIN) {
5483 *target = val < *target ? val : *target;
5484 } else if (aggregate == REDIS_AGGR_MAX) {
5485 *target = val > *target ? val : *target;
5486 } else {
5487 /* safety net */
5488 redisAssert(0 != 0);
5489 }
5490 }
5491
5492 static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
5493 int i, j, zsetnum;
5494 int aggregate = REDIS_AGGR_SUM;
5495 zsetopsrc *src;
5496 robj *dstobj;
5497 zset *dstzset;
5498 dictIterator *di;
5499 dictEntry *de;
5500
5501 /* expect zsetnum input keys to be given */
5502 zsetnum = atoi(c->argv[2]->ptr);
5503 if (zsetnum < 1) {
5504 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5505 return;
5506 }
5507
5508 /* test if the expected number of keys would overflow */
5509 if (3+zsetnum > c->argc) {
5510 addReply(c,shared.syntaxerr);
5511 return;
5512 }
5513
5514 /* read keys to be used for input */
5515 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
5516 for (i = 0, j = 3; i < zsetnum; i++, j++) {
5517 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5518 if (!zsetobj) {
5519 src[i].dict = NULL;
5520 } else {
5521 if (zsetobj->type != REDIS_ZSET) {
5522 zfree(src);
5523 addReply(c,shared.wrongtypeerr);
5524 return;
5525 }
5526 src[i].dict = ((zset*)zsetobj->ptr)->dict;
5527 }
5528
5529 /* default all weights to 1 */
5530 src[i].weight = 1.0;
5531 }
5532
5533 /* parse optional extra arguments */
5534 if (j < c->argc) {
5535 int remaining = c->argc - j;
5536
5537 while (remaining) {
5538 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
5539 j++; remaining--;
5540 for (i = 0; i < zsetnum; i++, j++, remaining--) {
5541 src[i].weight = strtod(c->argv[j]->ptr, NULL);
5542 }
5543 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5544 j++; remaining--;
5545 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5546 aggregate = REDIS_AGGR_SUM;
5547 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5548 aggregate = REDIS_AGGR_MIN;
5549 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5550 aggregate = REDIS_AGGR_MAX;
5551 } else {
5552 zfree(src);
5553 addReply(c,shared.syntaxerr);
5554 return;
5555 }
5556 j++; remaining--;
5557 } else {
5558 zfree(src);
5559 addReply(c,shared.syntaxerr);
5560 return;
5561 }
5562 }
5563 }
5564
5565 /* sort sets from the smallest to largest, this will improve our
5566 * algorithm's performance */
5567 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5568
5569 dstobj = createZsetObject();
5570 dstzset = dstobj->ptr;
5571
5572 if (op == REDIS_OP_INTER) {
5573 /* skip going over all entries if the smallest zset is NULL or empty */
5574 if (src[0].dict && dictSize(src[0].dict) > 0) {
5575 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5576 * from small to large, all src[i > 0].dict are non-empty too */
5577 di = dictGetIterator(src[0].dict);
5578 while((de = dictNext(di)) != NULL) {
5579 double *score = zmalloc(sizeof(double)), value;
5580 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
5581
5582 for (j = 1; j < zsetnum; j++) {
5583 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5584 if (other) {
5585 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5586 zunionInterAggregate(score, value, aggregate);
5587 } else {
5588 break;
5589 }
5590 }
5591
5592 /* skip entry when not present in every source dict */
5593 if (j != zsetnum) {
5594 zfree(score);
5595 } else {
5596 robj *o = dictGetEntryKey(de);
5597 dictAdd(dstzset->dict,o,score);
5598 incrRefCount(o); /* added to dictionary */
5599 zslInsert(dstzset->zsl,*score,o);
5600 incrRefCount(o); /* added to skiplist */
5601 }
5602 }
5603 dictReleaseIterator(di);
5604 }
5605 } else if (op == REDIS_OP_UNION) {
5606 for (i = 0; i < zsetnum; i++) {
5607 if (!src[i].dict) continue;
5608
5609 di = dictGetIterator(src[i].dict);
5610 while((de = dictNext(di)) != NULL) {
5611 /* skip key when already processed */
5612 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5613
5614 double *score = zmalloc(sizeof(double)), value;
5615 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
5616
5617 /* because the zsets are sorted by size, its only possible
5618 * for sets at larger indices to hold this entry */
5619 for (j = (i+1); j < zsetnum; j++) {
5620 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
5621 if (other) {
5622 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5623 zunionInterAggregate(score, value, aggregate);
5624 }
5625 }
5626
5627 robj *o = dictGetEntryKey(de);
5628 dictAdd(dstzset->dict,o,score);
5629 incrRefCount(o); /* added to dictionary */
5630 zslInsert(dstzset->zsl,*score,o);
5631 incrRefCount(o); /* added to skiplist */
5632 }
5633 dictReleaseIterator(di);
5634 }
5635 } else {
5636 /* unknown operator */
5637 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
5638 }
5639
5640 deleteKey(c->db,dstkey);
5641 if (dstzset->zsl->length) {
5642 dictAdd(c->db->dict,dstkey,dstobj);
5643 incrRefCount(dstkey);
5644 addReplyLong(c, dstzset->zsl->length);
5645 server.dirty++;
5646 } else {
5647 decrRefCount(dstzset);
5648 addReply(c, shared.czero);
5649 }
5650 zfree(src);
5651 }
5652
5653 static void zunionCommand(redisClient *c) {
5654 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
5655 }
5656
5657 static void zinterCommand(redisClient *c) {
5658 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
5659 }
5660
5661 static void zrangeGenericCommand(redisClient *c, int reverse) {
5662 robj *o;
5663 int start = atoi(c->argv[2]->ptr);
5664 int end = atoi(c->argv[3]->ptr);
5665 int withscores = 0;
5666 int llen;
5667 int rangelen, j;
5668 zset *zsetobj;
5669 zskiplist *zsl;
5670 zskiplistNode *ln;
5671 robj *ele;
5672
5673 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5674 withscores = 1;
5675 } else if (c->argc >= 5) {
5676 addReply(c,shared.syntaxerr);
5677 return;
5678 }
5679
5680 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5681 checkType(c,o,REDIS_ZSET)) return;
5682 zsetobj = o->ptr;
5683 zsl = zsetobj->zsl;
5684 llen = zsl->length;
5685
5686 /* convert negative indexes */
5687 if (start < 0) start = llen+start;
5688 if (end < 0) end = llen+end;
5689 if (start < 0) start = 0;
5690 if (end < 0) end = 0;
5691
5692 /* indexes sanity checks */
5693 if (start > end || start >= llen) {
5694 /* Out of range start or start > end result in empty list */
5695 addReply(c,shared.emptymultibulk);
5696 return;
5697 }
5698 if (end >= llen) end = llen-1;
5699 rangelen = (end-start)+1;
5700
5701 /* check if starting point is trivial, before searching
5702 * the element in log(N) time */
5703 if (reverse) {
5704 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5705 } else {
5706 ln = start == 0 ?
5707 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5708 }
5709
5710 /* Return the result in form of a multi-bulk reply */
5711 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5712 withscores ? (rangelen*2) : rangelen));
5713 for (j = 0; j < rangelen; j++) {
5714 ele = ln->obj;
5715 addReplyBulk(c,ele);
5716 if (withscores)
5717 addReplyDouble(c,ln->score);
5718 ln = reverse ? ln->backward : ln->forward[0];
5719 }
5720 }
5721
5722 static void zrangeCommand(redisClient *c) {
5723 zrangeGenericCommand(c,0);
5724 }
5725
5726 static void zrevrangeCommand(redisClient *c) {
5727 zrangeGenericCommand(c,1);
5728 }
5729
5730 /* This command implements both ZRANGEBYSCORE and ZCOUNT.
5731 * If justcount is non-zero, just the count is returned. */
5732 static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
5733 robj *o;
5734 double min, max;
5735 int minex = 0, maxex = 0; /* are min or max exclusive? */
5736 int offset = 0, limit = -1;
5737 int withscores = 0;
5738 int badsyntax = 0;
5739
5740 /* Parse the min-max interval. If one of the values is prefixed
5741 * by the "(" character, it's considered "open". For instance
5742 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5743 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5744 if (((char*)c->argv[2]->ptr)[0] == '(') {
5745 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5746 minex = 1;
5747 } else {
5748 min = strtod(c->argv[2]->ptr,NULL);
5749 }
5750 if (((char*)c->argv[3]->ptr)[0] == '(') {
5751 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5752 maxex = 1;
5753 } else {
5754 max = strtod(c->argv[3]->ptr,NULL);
5755 }
5756
5757 /* Parse "WITHSCORES": note that if the command was called with
5758 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5759 * enter the following paths to parse WITHSCORES and LIMIT. */
5760 if (c->argc == 5 || c->argc == 8) {
5761 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5762 withscores = 1;
5763 else
5764 badsyntax = 1;
5765 }
5766 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
5767 badsyntax = 1;
5768 if (badsyntax) {
5769 addReplySds(c,
5770 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
5771 return;
5772 }
5773
5774 /* Parse "LIMIT" */
5775 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
5776 addReply(c,shared.syntaxerr);
5777 return;
5778 } else if (c->argc == (7 + withscores)) {
5779 offset = atoi(c->argv[5]->ptr);
5780 limit = atoi(c->argv[6]->ptr);
5781 if (offset < 0) offset = 0;
5782 }
5783
5784 /* Ok, lookup the key and get the range */
5785 o = lookupKeyRead(c->db,c->argv[1]);
5786 if (o == NULL) {
5787 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
5788 } else {
5789 if (o->type != REDIS_ZSET) {
5790 addReply(c,shared.wrongtypeerr);
5791 } else {
5792 zset *zsetobj = o->ptr;
5793 zskiplist *zsl = zsetobj->zsl;
5794 zskiplistNode *ln;
5795 robj *ele, *lenobj = NULL;
5796 unsigned long rangelen = 0;
5797
5798 /* Get the first node with the score >= min, or with
5799 * score > min if 'minex' is true. */
5800 ln = zslFirstWithScore(zsl,min);
5801 while (minex && ln && ln->score == min) ln = ln->forward[0];
5802
5803 if (ln == NULL) {
5804 /* No element matching the speciifed interval */
5805 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
5806 return;
5807 }
5808
5809 /* We don't know in advance how many matching elements there
5810 * are in the list, so we push this object that will represent
5811 * the multi-bulk length in the output buffer, and will "fix"
5812 * it later */
5813 if (!justcount) {
5814 lenobj = createObject(REDIS_STRING,NULL);
5815 addReply(c,lenobj);
5816 decrRefCount(lenobj);
5817 }
5818
5819 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
5820 if (offset) {
5821 offset--;
5822 ln = ln->forward[0];
5823 continue;
5824 }
5825 if (limit == 0) break;
5826 if (!justcount) {
5827 ele = ln->obj;
5828 addReplyBulk(c,ele);
5829 if (withscores)
5830 addReplyDouble(c,ln->score);
5831 }
5832 ln = ln->forward[0];
5833 rangelen++;
5834 if (limit > 0) limit--;
5835 }
5836 if (justcount) {
5837 addReplyLong(c,(long)rangelen);
5838 } else {
5839 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5840 withscores ? (rangelen*2) : rangelen);
5841 }
5842 }
5843 }
5844 }
5845
5846 static void zrangebyscoreCommand(redisClient *c) {
5847 genericZrangebyscoreCommand(c,0);
5848 }
5849
5850 static void zcountCommand(redisClient *c) {
5851 genericZrangebyscoreCommand(c,1);
5852 }
5853
5854 static void zcardCommand(redisClient *c) {
5855 robj *o;
5856 zset *zs;
5857
5858 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5859 checkType(c,o,REDIS_ZSET)) return;
5860
5861 zs = o->ptr;
5862 addReplyUlong(c,zs->zsl->length);
5863 }
5864
5865 static void zscoreCommand(redisClient *c) {
5866 robj *o;
5867 zset *zs;
5868 dictEntry *de;
5869
5870 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5871 checkType(c,o,REDIS_ZSET)) return;
5872
5873 zs = o->ptr;
5874 de = dictFind(zs->dict,c->argv[2]);
5875 if (!de) {
5876 addReply(c,shared.nullbulk);
5877 } else {
5878 double *score = dictGetEntryVal(de);
5879
5880 addReplyDouble(c,*score);
5881 }
5882 }
5883
5884 static void zrankGenericCommand(redisClient *c, int reverse) {
5885 robj *o;
5886 zset *zs;
5887 zskiplist *zsl;
5888 dictEntry *de;
5889 unsigned long rank;
5890 double *score;
5891
5892 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5893 checkType(c,o,REDIS_ZSET)) return;
5894
5895 zs = o->ptr;
5896 zsl = zs->zsl;
5897 de = dictFind(zs->dict,c->argv[2]);
5898 if (!de) {
5899 addReply(c,shared.nullbulk);
5900 return;
5901 }
5902
5903 score = dictGetEntryVal(de);
5904 rank = zslGetRank(zsl, *score, c->argv[2]);
5905 if (rank) {
5906 if (reverse) {
5907 addReplyLong(c, zsl->length - rank);
5908 } else {
5909 addReplyLong(c, rank-1);
5910 }
5911 } else {
5912 addReply(c,shared.nullbulk);
5913 }
5914 }
5915
5916 static void zrankCommand(redisClient *c) {
5917 zrankGenericCommand(c, 0);
5918 }
5919
5920 static void zrevrankCommand(redisClient *c) {
5921 zrankGenericCommand(c, 1);
5922 }
5923
5924 /* =================================== Hashes =============================== */
5925 static void hsetCommand(redisClient *c) {
5926 int update = 0;
5927 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5928
5929 if (o == NULL) {
5930 o = createHashObject();
5931 dictAdd(c->db->dict,c->argv[1],o);
5932 incrRefCount(c->argv[1]);
5933 } else {
5934 if (o->type != REDIS_HASH) {
5935 addReply(c,shared.wrongtypeerr);
5936 return;
5937 }
5938 }
5939 /* We want to convert the zipmap into an hash table right now if the
5940 * entry to be added is too big. Note that we check if the object
5941 * is integer encoded before to try fetching the length in the test below.
5942 * This is because integers are small, but currently stringObjectLen()
5943 * performs a slow conversion: not worth it. */
5944 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5945 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5946 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5947 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5948 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5949 {
5950 convertToRealHash(o);
5951 }
5952
5953 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5954 unsigned char *zm = o->ptr;
5955 robj *valobj = getDecodedObject(c->argv[3]);
5956
5957 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
5958 valobj->ptr,sdslen(valobj->ptr),&update);
5959 decrRefCount(valobj);
5960 o->ptr = zm;
5961
5962 /* And here there is the second check for hash conversion...
5963 * we want to do it only if the operation was not just an update as
5964 * zipmapLen() is O(N). */
5965 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
5966 convertToRealHash(o);
5967 } else {
5968 tryObjectEncoding(c->argv[2]);
5969 /* note that c->argv[3] is already encoded, as the latest arg
5970 * of a bulk command is always integer encoded if possible. */
5971 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
5972 incrRefCount(c->argv[2]);
5973 } else {
5974 update = 1;
5975 }
5976 incrRefCount(c->argv[3]);
5977 }
5978 server.dirty++;
5979 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5980 }
5981
5982 static void hincrbyCommand(redisClient *c) {
5983 int update = 0;
5984 long long value = 0, incr = 0;
5985 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5986
5987 if (o == NULL) {
5988 o = createHashObject();
5989 dictAdd(c->db->dict,c->argv[1],o);
5990 incrRefCount(c->argv[1]);
5991 } else {
5992 if (o->type != REDIS_HASH) {
5993 addReply(c,shared.wrongtypeerr);
5994 return;
5995 }
5996 }
5997
5998 robj *o_incr = getDecodedObject(c->argv[3]);
5999 incr = strtoll(o_incr->ptr, NULL, 10);
6000 decrRefCount(o_incr);
6001
6002 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6003 unsigned char *zm = o->ptr;
6004 unsigned char *zval;
6005 unsigned int zvlen;
6006
6007 /* Find value if already present in hash */
6008 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6009 &zval,&zvlen)) {
6010 /* strtoll needs the char* to have a trailing \0, but
6011 * the zipmap doesn't include them. */
6012 sds szval = sdsnewlen(zval, zvlen);
6013 value = strtoll(szval,NULL,10);
6014 sdsfree(szval);
6015 }
6016
6017 value += incr;
6018 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6019 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6020 (unsigned char*)svalue,sdslen(svalue),&update);
6021 sdsfree(svalue);
6022 o->ptr = zm;
6023
6024 /* Check if the zipmap needs to be converted
6025 * if this was not an update. */
6026 if (!update && zipmapLen(zm) > server.hash_max_zipmap_entries)
6027 convertToRealHash(o);
6028 } else {
6029 robj *hval;
6030 dictEntry *de;
6031
6032 /* Find value if already present in hash */
6033 de = dictFind(o->ptr,c->argv[2]);
6034 if (de != NULL) {
6035 hval = dictGetEntryVal(de);
6036 if (hval->encoding == REDIS_ENCODING_RAW)
6037 value = strtoll(hval->ptr,NULL,10);
6038 else if (hval->encoding == REDIS_ENCODING_INT)
6039 value = (long)hval->ptr;
6040 else
6041 redisAssert(1 != 1);
6042 }
6043
6044 value += incr;
6045 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6046 tryObjectEncoding(hval);
6047 if (dictReplace(o->ptr,c->argv[2],hval)) {
6048 incrRefCount(c->argv[2]);
6049 }
6050 }
6051
6052 server.dirty++;
6053 addReplyLong(c, value);
6054 }
6055
6056 static void hgetCommand(redisClient *c) {
6057 robj *o;
6058
6059 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6060 checkType(c,o,REDIS_HASH)) return;
6061
6062 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6063 unsigned char *zm = o->ptr;
6064 unsigned char *val;
6065 unsigned int vlen;
6066 robj *field;
6067
6068 field = getDecodedObject(c->argv[2]);
6069 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
6070 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6071 addReplySds(c,sdsnewlen(val,vlen));
6072 addReply(c,shared.crlf);
6073 decrRefCount(field);
6074 return;
6075 } else {
6076 addReply(c,shared.nullbulk);
6077 decrRefCount(field);
6078 return;
6079 }
6080 } else {
6081 struct dictEntry *de;
6082
6083 de = dictFind(o->ptr,c->argv[2]);
6084 if (de == NULL) {
6085 addReply(c,shared.nullbulk);
6086 } else {
6087 robj *e = dictGetEntryVal(de);
6088
6089 addReplyBulk(c,e);
6090 }
6091 }
6092 }
6093
6094 static void hdelCommand(redisClient *c) {
6095 robj *o;
6096 int deleted = 0;
6097
6098 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6099 checkType(c,o,REDIS_HASH)) return;
6100
6101 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6102 robj *field = getDecodedObject(c->argv[2]);
6103
6104 o->ptr = zipmapDel((unsigned char*) o->ptr,
6105 (unsigned char*) field->ptr,
6106 sdslen(field->ptr), &deleted);
6107 decrRefCount(field);
6108 if (zipmapLen((unsigned char*) o->ptr) == 0)
6109 deleteKey(c->db,c->argv[1]);
6110 } else {
6111 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
6112 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6113 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
6114 }
6115 if (deleted) server.dirty++;
6116 addReply(c,deleted ? shared.cone : shared.czero);
6117 }
6118
6119 static void hlenCommand(redisClient *c) {
6120 robj *o;
6121 unsigned long len;
6122
6123 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6124 checkType(c,o,REDIS_HASH)) return;
6125
6126 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6127 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6128 addReplyUlong(c,len);
6129 }
6130
6131 #define REDIS_GETALL_KEYS 1
6132 #define REDIS_GETALL_VALS 2
6133 static void genericHgetallCommand(redisClient *c, int flags) {
6134 robj *o, *lenobj;
6135 unsigned long count = 0;
6136
6137 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6138 || checkType(c,o,REDIS_HASH)) return;
6139
6140 lenobj = createObject(REDIS_STRING,NULL);
6141 addReply(c,lenobj);
6142 decrRefCount(lenobj);
6143
6144 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6145 unsigned char *p = zipmapRewind(o->ptr);
6146 unsigned char *field, *val;
6147 unsigned int flen, vlen;
6148
6149 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6150 robj *aux;
6151
6152 if (flags & REDIS_GETALL_KEYS) {
6153 aux = createStringObject((char*)field,flen);
6154 addReplyBulk(c,aux);
6155 decrRefCount(aux);
6156 count++;
6157 }
6158 if (flags & REDIS_GETALL_VALS) {
6159 aux = createStringObject((char*)val,vlen);
6160 addReplyBulk(c,aux);
6161 decrRefCount(aux);
6162 count++;
6163 }
6164 }
6165 } else {
6166 dictIterator *di = dictGetIterator(o->ptr);
6167 dictEntry *de;
6168
6169 while((de = dictNext(di)) != NULL) {
6170 robj *fieldobj = dictGetEntryKey(de);
6171 robj *valobj = dictGetEntryVal(de);
6172
6173 if (flags & REDIS_GETALL_KEYS) {
6174 addReplyBulk(c,fieldobj);
6175 count++;
6176 }
6177 if (flags & REDIS_GETALL_VALS) {
6178 addReplyBulk(c,valobj);
6179 count++;
6180 }
6181 }
6182 dictReleaseIterator(di);
6183 }
6184 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6185 }
6186
6187 static void hkeysCommand(redisClient *c) {
6188 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6189 }
6190
6191 static void hvalsCommand(redisClient *c) {
6192 genericHgetallCommand(c,REDIS_GETALL_VALS);
6193 }
6194
6195 static void hgetallCommand(redisClient *c) {
6196 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6197 }
6198
6199 static void hexistsCommand(redisClient *c) {
6200 robj *o;
6201 int exists = 0;
6202
6203 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6204 checkType(c,o,REDIS_HASH)) return;
6205
6206 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6207 robj *field;
6208 unsigned char *zm = o->ptr;
6209
6210 field = getDecodedObject(c->argv[2]);
6211 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6212 decrRefCount(field);
6213 } else {
6214 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6215 }
6216 addReply(c,exists ? shared.cone : shared.czero);
6217 }
6218
6219 static void convertToRealHash(robj *o) {
6220 unsigned char *key, *val, *p, *zm = o->ptr;
6221 unsigned int klen, vlen;
6222 dict *dict = dictCreate(&hashDictType,NULL);
6223
6224 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6225 p = zipmapRewind(zm);
6226 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6227 robj *keyobj, *valobj;
6228
6229 keyobj = createStringObject((char*)key,klen);
6230 valobj = createStringObject((char*)val,vlen);
6231 tryObjectEncoding(keyobj);
6232 tryObjectEncoding(valobj);
6233 dictAdd(dict,keyobj,valobj);
6234 }
6235 o->encoding = REDIS_ENCODING_HT;
6236 o->ptr = dict;
6237 zfree(zm);
6238 }
6239
6240 /* ========================= Non type-specific commands ==================== */
6241
6242 static void flushdbCommand(redisClient *c) {
6243 server.dirty += dictSize(c->db->dict);
6244 dictEmpty(c->db->dict);
6245 dictEmpty(c->db->expires);
6246 addReply(c,shared.ok);
6247 }
6248
6249 static void flushallCommand(redisClient *c) {
6250 server.dirty += emptyDb();
6251 addReply(c,shared.ok);
6252 if (server.bgsavechildpid != -1) {
6253 kill(server.bgsavechildpid,SIGKILL);
6254 rdbRemoveTempFile(server.bgsavechildpid);
6255 }
6256 rdbSave(server.dbfilename);
6257 server.dirty++;
6258 }
6259
6260 static redisSortOperation *createSortOperation(int type, robj *pattern) {
6261 redisSortOperation *so = zmalloc(sizeof(*so));
6262 so->type = type;
6263 so->pattern = pattern;
6264 return so;
6265 }
6266
6267 /* Return the value associated to the key with a name obtained
6268 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6269 static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6270 char *p;
6271 sds spat, ssub;
6272 robj keyobj;
6273 int prefixlen, sublen, postfixlen;
6274 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6275 struct {
6276 long len;
6277 long free;
6278 char buf[REDIS_SORTKEY_MAX+1];
6279 } keyname;
6280
6281 /* If the pattern is "#" return the substitution object itself in order
6282 * to implement the "SORT ... GET #" feature. */
6283 spat = pattern->ptr;
6284 if (spat[0] == '#' && spat[1] == '\0') {
6285 return subst;
6286 }
6287
6288 /* The substitution object may be specially encoded. If so we create
6289 * a decoded object on the fly. Otherwise getDecodedObject will just
6290 * increment the ref count, that we'll decrement later. */
6291 subst = getDecodedObject(subst);
6292
6293 ssub = subst->ptr;
6294 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6295 p = strchr(spat,'*');
6296 if (!p) {
6297 decrRefCount(subst);
6298 return NULL;
6299 }
6300
6301 prefixlen = p-spat;
6302 sublen = sdslen(ssub);
6303 postfixlen = sdslen(spat)-(prefixlen+1);
6304 memcpy(keyname.buf,spat,prefixlen);
6305 memcpy(keyname.buf+prefixlen,ssub,sublen);
6306 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6307 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6308 keyname.len = prefixlen+sublen+postfixlen;
6309
6310 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6311 decrRefCount(subst);
6312
6313 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
6314 return lookupKeyRead(db,&keyobj);
6315 }
6316
6317 /* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6318 * the additional parameter is not standard but a BSD-specific we have to
6319 * pass sorting parameters via the global 'server' structure */
6320 static int sortCompare(const void *s1, const void *s2) {
6321 const redisSortObject *so1 = s1, *so2 = s2;
6322 int cmp;
6323
6324 if (!server.sort_alpha) {
6325 /* Numeric sorting. Here it's trivial as we precomputed scores */
6326 if (so1->u.score > so2->u.score) {
6327 cmp = 1;
6328 } else if (so1->u.score < so2->u.score) {
6329 cmp = -1;
6330 } else {
6331 cmp = 0;
6332 }
6333 } else {
6334 /* Alphanumeric sorting */
6335 if (server.sort_bypattern) {
6336 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6337 /* At least one compare object is NULL */
6338 if (so1->u.cmpobj == so2->u.cmpobj)
6339 cmp = 0;
6340 else if (so1->u.cmpobj == NULL)
6341 cmp = -1;
6342 else
6343 cmp = 1;
6344 } else {
6345 /* We have both the objects, use strcoll */
6346 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6347 }
6348 } else {
6349 /* Compare elements directly */
6350 robj *dec1, *dec2;
6351
6352 dec1 = getDecodedObject(so1->obj);
6353 dec2 = getDecodedObject(so2->obj);
6354 cmp = strcoll(dec1->ptr,dec2->ptr);
6355 decrRefCount(dec1);
6356 decrRefCount(dec2);
6357 }
6358 }
6359 return server.sort_desc ? -cmp : cmp;
6360 }
6361
6362 /* The SORT command is the most complex command in Redis. Warning: this code
6363 * is optimized for speed and a bit less for readability */
6364 static void sortCommand(redisClient *c) {
6365 list *operations;
6366 int outputlen = 0;
6367 int desc = 0, alpha = 0;
6368 int limit_start = 0, limit_count = -1, start, end;
6369 int j, dontsort = 0, vectorlen;
6370 int getop = 0; /* GET operation counter */
6371 robj *sortval, *sortby = NULL, *storekey = NULL;
6372 redisSortObject *vector; /* Resulting vector to sort */
6373
6374 /* Lookup the key to sort. It must be of the right types */
6375 sortval = lookupKeyRead(c->db,c->argv[1]);
6376 if (sortval == NULL) {
6377 addReply(c,shared.nullmultibulk);
6378 return;
6379 }
6380 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6381 sortval->type != REDIS_ZSET)
6382 {
6383 addReply(c,shared.wrongtypeerr);
6384 return;
6385 }
6386
6387 /* Create a list of operations to perform for every sorted element.
6388 * Operations can be GET/DEL/INCR/DECR */
6389 operations = listCreate();
6390 listSetFreeMethod(operations,zfree);
6391 j = 2;
6392
6393 /* Now we need to protect sortval incrementing its count, in the future
6394 * SORT may have options able to overwrite/delete keys during the sorting
6395 * and the sorted key itself may get destroied */
6396 incrRefCount(sortval);
6397
6398 /* The SORT command has an SQL-alike syntax, parse it */
6399 while(j < c->argc) {
6400 int leftargs = c->argc-j-1;
6401 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6402 desc = 0;
6403 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6404 desc = 1;
6405 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6406 alpha = 1;
6407 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6408 limit_start = atoi(c->argv[j+1]->ptr);
6409 limit_count = atoi(c->argv[j+2]->ptr);
6410 j+=2;
6411 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6412 storekey = c->argv[j+1];
6413 j++;
6414 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6415 sortby = c->argv[j+1];
6416 /* If the BY pattern does not contain '*', i.e. it is constant,
6417 * we don't need to sort nor to lookup the weight keys. */
6418 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6419 j++;
6420 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6421 listAddNodeTail(operations,createSortOperation(
6422 REDIS_SORT_GET,c->argv[j+1]));
6423 getop++;
6424 j++;
6425 } else {
6426 decrRefCount(sortval);
6427 listRelease(operations);
6428 addReply(c,shared.syntaxerr);
6429 return;
6430 }
6431 j++;
6432 }
6433
6434 /* Load the sorting vector with all the objects to sort */
6435 switch(sortval->type) {
6436 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6437 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6438 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
6439 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
6440 }
6441 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
6442 j = 0;
6443
6444 if (sortval->type == REDIS_LIST) {
6445 list *list = sortval->ptr;
6446 listNode *ln;
6447 listIter li;
6448
6449 listRewind(list,&li);
6450 while((ln = listNext(&li))) {
6451 robj *ele = ln->value;
6452 vector[j].obj = ele;
6453 vector[j].u.score = 0;
6454 vector[j].u.cmpobj = NULL;
6455 j++;
6456 }
6457 } else {
6458 dict *set;
6459 dictIterator *di;
6460 dictEntry *setele;
6461
6462 if (sortval->type == REDIS_SET) {
6463 set = sortval->ptr;
6464 } else {
6465 zset *zs = sortval->ptr;
6466 set = zs->dict;
6467 }
6468
6469 di = dictGetIterator(set);
6470 while((setele = dictNext(di)) != NULL) {
6471 vector[j].obj = dictGetEntryKey(setele);
6472 vector[j].u.score = 0;
6473 vector[j].u.cmpobj = NULL;
6474 j++;
6475 }
6476 dictReleaseIterator(di);
6477 }
6478 redisAssert(j == vectorlen);
6479
6480 /* Now it's time to load the right scores in the sorting vector */
6481 if (dontsort == 0) {
6482 for (j = 0; j < vectorlen; j++) {
6483 if (sortby) {
6484 robj *byval;
6485
6486 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
6487 if (!byval || byval->type != REDIS_STRING) continue;
6488 if (alpha) {
6489 vector[j].u.cmpobj = getDecodedObject(byval);
6490 } else {
6491 if (byval->encoding == REDIS_ENCODING_RAW) {
6492 vector[j].u.score = strtod(byval->ptr,NULL);
6493 } else {
6494 /* Don't need to decode the object if it's
6495 * integer-encoded (the only encoding supported) so
6496 * far. We can just cast it */
6497 if (byval->encoding == REDIS_ENCODING_INT) {
6498 vector[j].u.score = (long)byval->ptr;
6499 } else
6500 redisAssert(1 != 1);
6501 }
6502 }
6503 } else {
6504 if (!alpha) {
6505 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6506 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6507 else {
6508 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6509 vector[j].u.score = (long) vector[j].obj->ptr;
6510 else
6511 redisAssert(1 != 1);
6512 }
6513 }
6514 }
6515 }
6516 }
6517
6518 /* We are ready to sort the vector... perform a bit of sanity check
6519 * on the LIMIT option too. We'll use a partial version of quicksort. */
6520 start = (limit_start < 0) ? 0 : limit_start;
6521 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6522 if (start >= vectorlen) {
6523 start = vectorlen-1;
6524 end = vectorlen-2;
6525 }
6526 if (end >= vectorlen) end = vectorlen-1;
6527
6528 if (dontsort == 0) {
6529 server.sort_desc = desc;
6530 server.sort_alpha = alpha;
6531 server.sort_bypattern = sortby ? 1 : 0;
6532 if (sortby && (start != 0 || end != vectorlen-1))
6533 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6534 else
6535 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
6536 }
6537
6538 /* Send command output to the output buffer, performing the specified
6539 * GET/DEL/INCR/DECR operations if any. */
6540 outputlen = getop ? getop*(end-start+1) : end-start+1;
6541 if (storekey == NULL) {
6542 /* STORE option not specified, sent the sorting result to client */
6543 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6544 for (j = start; j <= end; j++) {
6545 listNode *ln;
6546 listIter li;
6547
6548 if (!getop) addReplyBulk(c,vector[j].obj);
6549 listRewind(operations,&li);
6550 while((ln = listNext(&li))) {
6551 redisSortOperation *sop = ln->value;
6552 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6553 vector[j].obj);
6554
6555 if (sop->type == REDIS_SORT_GET) {
6556 if (!val || val->type != REDIS_STRING) {
6557 addReply(c,shared.nullbulk);
6558 } else {
6559 addReplyBulk(c,val);
6560 }
6561 } else {
6562 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6563 }
6564 }
6565 }
6566 } else {
6567 robj *listObject = createListObject();
6568 list *listPtr = (list*) listObject->ptr;
6569
6570 /* STORE option specified, set the sorting result as a List object */
6571 for (j = start; j <= end; j++) {
6572 listNode *ln;
6573 listIter li;
6574
6575 if (!getop) {
6576 listAddNodeTail(listPtr,vector[j].obj);
6577 incrRefCount(vector[j].obj);
6578 }
6579 listRewind(operations,&li);
6580 while((ln = listNext(&li))) {
6581 redisSortOperation *sop = ln->value;
6582 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6583 vector[j].obj);
6584
6585 if (sop->type == REDIS_SORT_GET) {
6586 if (!val || val->type != REDIS_STRING) {
6587 listAddNodeTail(listPtr,createStringObject("",0));
6588 } else {
6589 listAddNodeTail(listPtr,val);
6590 incrRefCount(val);
6591 }
6592 } else {
6593 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
6594 }
6595 }
6596 }
6597 if (dictReplace(c->db->dict,storekey,listObject)) {
6598 incrRefCount(storekey);
6599 }
6600 /* Note: we add 1 because the DB is dirty anyway since even if the
6601 * SORT result is empty a new key is set and maybe the old content
6602 * replaced. */
6603 server.dirty += 1+outputlen;
6604 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
6605 }
6606
6607 /* Cleanup */
6608 decrRefCount(sortval);
6609 listRelease(operations);
6610 for (j = 0; j < vectorlen; j++) {
6611 if (sortby && alpha && vector[j].u.cmpobj)
6612 decrRefCount(vector[j].u.cmpobj);
6613 }
6614 zfree(vector);
6615 }
6616
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' must point to a buffer large enough for the result: the longest
 * output this function can produce is well under 32 bytes.
 *
 * Every value of 'n' produces an output: amounts of one tebibyte or more,
 * that previously left 's' untouched (hence uninitialized for the caller),
 * are now rendered with the T and P suffixes. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else {
        /* Everything else, up to ULLONG_MAX, is expressed in pebibytes. */
        sprintf(s,"%.2fP",(double)n/pib);
    }
}
6637
6638 /* Create the string returned by the INFO command. This is decoupled
6639 * by the INFO command itself as we need to report the same information
6640 * on memory corruption problems. */
6641 static sds genRedisInfoString(void) {
6642 sds info;
6643 time_t uptime = time(NULL)-server.stat_starttime;
6644 int j;
6645 char hmem[64];
6646
6647 bytesToHuman(hmem,zmalloc_used_memory());
6648 info = sdscatprintf(sdsempty(),
6649 "redis_version:%s\r\n"
6650 "arch_bits:%s\r\n"
6651 "multiplexing_api:%s\r\n"
6652 "process_id:%ld\r\n"
6653 "uptime_in_seconds:%ld\r\n"
6654 "uptime_in_days:%ld\r\n"
6655 "connected_clients:%d\r\n"
6656 "connected_slaves:%d\r\n"
6657 "blocked_clients:%d\r\n"
6658 "used_memory:%zu\r\n"
6659 "used_memory_human:%s\r\n"
6660 "changes_since_last_save:%lld\r\n"
6661 "bgsave_in_progress:%d\r\n"
6662 "last_save_time:%ld\r\n"
6663 "bgrewriteaof_in_progress:%d\r\n"
6664 "total_connections_received:%lld\r\n"
6665 "total_commands_processed:%lld\r\n"
6666 "expired_keys:%lld\r\n"
6667 "hash_max_zipmap_entries:%ld\r\n"
6668 "hash_max_zipmap_value:%ld\r\n"
6669 "vm_enabled:%d\r\n"
6670 "role:%s\r\n"
6671 ,REDIS_VERSION,
6672 (sizeof(long) == 8) ? "64" : "32",
6673 aeGetApiName(),
6674 (long) getpid(),
6675 uptime,
6676 uptime/(3600*24),
6677 listLength(server.clients)-listLength(server.slaves),
6678 listLength(server.slaves),
6679 server.blpop_blocked_clients,
6680 zmalloc_used_memory(),
6681 hmem,
6682 server.dirty,
6683 server.bgsavechildpid != -1,
6684 server.lastsave,
6685 server.bgrewritechildpid != -1,
6686 server.stat_numconnections,
6687 server.stat_numcommands,
6688 server.stat_expiredkeys,
6689 server.hash_max_zipmap_entries,
6690 server.hash_max_zipmap_value,
6691 server.vm_enabled != 0,
6692 server.masterhost == NULL ? "master" : "slave"
6693 );
6694 if (server.masterhost) {
6695 info = sdscatprintf(info,
6696 "master_host:%s\r\n"
6697 "master_port:%d\r\n"
6698 "master_link_status:%s\r\n"
6699 "master_last_io_seconds_ago:%d\r\n"
6700 ,server.masterhost,
6701 server.masterport,
6702 (server.replstate == REDIS_REPL_CONNECTED) ?
6703 "up" : "down",
6704 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6705 );
6706 }
6707 if (server.vm_enabled) {
6708 lockThreadedIO();
6709 info = sdscatprintf(info,
6710 "vm_conf_max_memory:%llu\r\n"
6711 "vm_conf_page_size:%llu\r\n"
6712 "vm_conf_pages:%llu\r\n"
6713 "vm_stats_used_pages:%llu\r\n"
6714 "vm_stats_swapped_objects:%llu\r\n"
6715 "vm_stats_swappin_count:%llu\r\n"
6716 "vm_stats_swappout_count:%llu\r\n"
6717 "vm_stats_io_newjobs_len:%lu\r\n"
6718 "vm_stats_io_processing_len:%lu\r\n"
6719 "vm_stats_io_processed_len:%lu\r\n"
6720 "vm_stats_io_active_threads:%lu\r\n"
6721 "vm_stats_blocked_clients:%lu\r\n"
6722 ,(unsigned long long) server.vm_max_memory,
6723 (unsigned long long) server.vm_page_size,
6724 (unsigned long long) server.vm_pages,
6725 (unsigned long long) server.vm_stats_used_pages,
6726 (unsigned long long) server.vm_stats_swapped_objects,
6727 (unsigned long long) server.vm_stats_swapins,
6728 (unsigned long long) server.vm_stats_swapouts,
6729 (unsigned long) listLength(server.io_newjobs),
6730 (unsigned long) listLength(server.io_processing),
6731 (unsigned long) listLength(server.io_processed),
6732 (unsigned long) server.io_active_threads,
6733 (unsigned long) server.vm_blocked_clients
6734 );
6735 unlockThreadedIO();
6736 }
6737 for (j = 0; j < server.dbnum; j++) {
6738 long long keys, vkeys;
6739
6740 keys = dictSize(server.db[j].dict);
6741 vkeys = dictSize(server.db[j].expires);
6742 if (keys || vkeys) {
6743 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6744 j, keys, vkeys);
6745 }
6746 }
6747 return info;
6748 }
6749
6750 static void infoCommand(redisClient *c) {
6751 sds info = genRedisInfoString();
6752 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6753 (unsigned long)sdslen(info)));
6754 addReplySds(c,info);
6755 addReply(c,shared.crlf);
6756 }
6757
6758 static void monitorCommand(redisClient *c) {
6759 /* ignore MONITOR if aleady slave or in monitor mode */
6760 if (c->flags & REDIS_SLAVE) return;
6761
6762 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6763 c->slaveseldb = 0;
6764 listAddNodeTail(server.monitors,c);
6765 addReply(c,shared.ok);
6766 }
6767
6768 /* ================================= Expire ================================= */
6769 static int removeExpire(redisDb *db, robj *key) {
6770 if (dictDelete(db->expires,key) == DICT_OK) {
6771 return 1;
6772 } else {
6773 return 0;
6774 }
6775 }
6776
6777 static int setExpire(redisDb *db, robj *key, time_t when) {
6778 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6779 return 0;
6780 } else {
6781 incrRefCount(key);
6782 return 1;
6783 }
6784 }
6785
6786 /* Return the expire time of the specified key, or -1 if no expire
6787 * is associated with this key (i.e. the key is non volatile) */
6788 static time_t getExpire(redisDb *db, robj *key) {
6789 dictEntry *de;
6790
6791 /* No expire? return ASAP */
6792 if (dictSize(db->expires) == 0 ||
6793 (de = dictFind(db->expires,key)) == NULL) return -1;
6794
6795 return (time_t) dictGetEntryVal(de);
6796 }
6797
6798 static int expireIfNeeded(redisDb *db, robj *key) {
6799 time_t when;
6800 dictEntry *de;
6801
6802 /* No expire? return ASAP */
6803 if (dictSize(db->expires) == 0 ||
6804 (de = dictFind(db->expires,key)) == NULL) return 0;
6805
6806 /* Lookup the expire */
6807 when = (time_t) dictGetEntryVal(de);
6808 if (time(NULL) <= when) return 0;
6809
6810 /* Delete the key */
6811 dictDelete(db->expires,key);
6812 server.stat_expiredkeys++;
6813 return dictDelete(db->dict,key) == DICT_OK;
6814 }
6815
6816 static int deleteIfVolatile(redisDb *db, robj *key) {
6817 dictEntry *de;
6818
6819 /* No expire? return ASAP */
6820 if (dictSize(db->expires) == 0 ||
6821 (de = dictFind(db->expires,key)) == NULL) return 0;
6822
6823 /* Delete the key */
6824 server.dirty++;
6825 server.stat_expiredkeys++;
6826 dictDelete(db->expires,key);
6827 return dictDelete(db->dict,key) == DICT_OK;
6828 }
6829
6830 static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
6831 dictEntry *de;
6832
6833 de = dictFind(c->db->dict,key);
6834 if (de == NULL) {
6835 addReply(c,shared.czero);
6836 return;
6837 }
6838 if (seconds < 0) {
6839 if (deleteKey(c->db,key)) server.dirty++;
6840 addReply(c, shared.cone);
6841 return;
6842 } else {
6843 time_t when = time(NULL)+seconds;
6844 if (setExpire(c->db,key,when)) {
6845 addReply(c,shared.cone);
6846 server.dirty++;
6847 } else {
6848 addReply(c,shared.czero);
6849 }
6850 return;
6851 }
6852 }
6853
6854 static void expireCommand(redisClient *c) {
6855 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6856 }
6857
6858 static void expireatCommand(redisClient *c) {
6859 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6860 }
6861
6862 static void ttlCommand(redisClient *c) {
6863 time_t expire;
6864 int ttl = -1;
6865
6866 expire = getExpire(c->db,c->argv[1]);
6867 if (expire != -1) {
6868 ttl = (int) (expire-time(NULL));
6869 if (ttl < 0) ttl = -1;
6870 }
6871 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6872 }
6873
6874 /* ================================ MULTI/EXEC ============================== */
6875
6876 /* Client state initialization for MULTI/EXEC */
6877 static void initClientMultiState(redisClient *c) {
6878 c->mstate.commands = NULL;
6879 c->mstate.count = 0;
6880 }
6881
6882 /* Release all the resources associated with MULTI/EXEC state */
6883 static void freeClientMultiState(redisClient *c) {
6884 int j;
6885
6886 for (j = 0; j < c->mstate.count; j++) {
6887 int i;
6888 multiCmd *mc = c->mstate.commands+j;
6889
6890 for (i = 0; i < mc->argc; i++)
6891 decrRefCount(mc->argv[i]);
6892 zfree(mc->argv);
6893 }
6894 zfree(c->mstate.commands);
6895 }
6896
6897 /* Add a new command into the MULTI commands queue */
6898 static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6899 multiCmd *mc;
6900 int j;
6901
6902 c->mstate.commands = zrealloc(c->mstate.commands,
6903 sizeof(multiCmd)*(c->mstate.count+1));
6904 mc = c->mstate.commands+c->mstate.count;
6905 mc->cmd = cmd;
6906 mc->argc = c->argc;
6907 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6908 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6909 for (j = 0; j < c->argc; j++)
6910 incrRefCount(mc->argv[j]);
6911 c->mstate.count++;
6912 }
6913
6914 static void multiCommand(redisClient *c) {
6915 c->flags |= REDIS_MULTI;
6916 addReply(c,shared.ok);
6917 }
6918
6919 static void discardCommand(redisClient *c) {
6920 if (!(c->flags & REDIS_MULTI)) {
6921 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6922 return;
6923 }
6924
6925 freeClientMultiState(c);
6926 initClientMultiState(c);
6927 c->flags &= (~REDIS_MULTI);
6928 addReply(c,shared.ok);
6929 }
6930
6931 static void execCommand(redisClient *c) {
6932 int j;
6933 robj **orig_argv;
6934 int orig_argc;
6935
6936 if (!(c->flags & REDIS_MULTI)) {
6937 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6938 return;
6939 }
6940
6941 orig_argv = c->argv;
6942 orig_argc = c->argc;
6943 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6944 for (j = 0; j < c->mstate.count; j++) {
6945 c->argc = c->mstate.commands[j].argc;
6946 c->argv = c->mstate.commands[j].argv;
6947 call(c,c->mstate.commands[j].cmd);
6948 }
6949 c->argv = orig_argv;
6950 c->argc = orig_argc;
6951 freeClientMultiState(c);
6952 initClientMultiState(c);
6953 c->flags &= (~REDIS_MULTI);
6954 }
6955
6956 /* =========================== Blocking Operations ========================= */
6957
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
6986
6987 /* Set a client in blocking mode for the specified key, with the specified
6988 * timeout */
6989 static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
6990 dictEntry *de;
6991 list *l;
6992 int j;
6993
6994 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
6995 c->blockingkeysnum = numkeys;
6996 c->blockingto = timeout;
6997 for (j = 0; j < numkeys; j++) {
6998 /* Add the key in the client structure, to map clients -> keys */
6999 c->blockingkeys[j] = keys[j];
7000 incrRefCount(keys[j]);
7001
7002 /* And in the other "side", to map keys -> clients */
7003 de = dictFind(c->db->blockingkeys,keys[j]);
7004 if (de == NULL) {
7005 int retval;
7006
7007 /* For every key we take a list of clients blocked for it */
7008 l = listCreate();
7009 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7010 incrRefCount(keys[j]);
7011 assert(retval == DICT_OK);
7012 } else {
7013 l = dictGetEntryVal(de);
7014 }
7015 listAddNodeTail(l,c);
7016 }
7017 /* Mark the client as a blocked client */
7018 c->flags |= REDIS_BLOCKED;
7019 server.blpop_blocked_clients++;
7020 }
7021
7022 /* Unblock a client that's waiting in a blocking operation such as BLPOP */
7023 static void unblockClientWaitingData(redisClient *c) {
7024 dictEntry *de;
7025 list *l;
7026 int j;
7027
7028 assert(c->blockingkeys != NULL);
7029 /* The client may wait for multiple keys, so unblock it for every key. */
7030 for (j = 0; j < c->blockingkeysnum; j++) {
7031 /* Remove this client from the list of clients waiting for this key. */
7032 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7033 assert(de != NULL);
7034 l = dictGetEntryVal(de);
7035 listDelNode(l,listSearchKey(l,c));
7036 /* If the list is empty we need to remove it to avoid wasting memory */
7037 if (listLength(l) == 0)
7038 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7039 decrRefCount(c->blockingkeys[j]);
7040 }
7041 /* Cleanup the client structure */
7042 zfree(c->blockingkeys);
7043 c->blockingkeys = NULL;
7044 c->flags &= (~REDIS_BLOCKED);
7045 server.blpop_blocked_clients--;
7046 /* We want to process data if there is some command waiting
7047 * in the input buffer. Note that this is safe even if
7048 * unblockClientWaitingData() gets called from freeClient() because
7049 * freeClient() will be smart enough to call this function
7050 * *after* c->querybuf was set to NULL. */
7051 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7052 }
7053
7054 /* This should be called from any function PUSHing into lists.
7055 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7056 * 'ele' is the element pushed.
7057 *
7058 * If the function returns 0 there was no client waiting for a list push
7059 * against this key.
7060 *
7061 * If the function returns 1 there was a client waiting for a list push
7062 * against this key, the element was passed to this client thus it's not
7063 * needed to actually add it to the list and the caller should return asap. */
7064 static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7065 struct dictEntry *de;
7066 redisClient *receiver;
7067 list *l;
7068 listNode *ln;
7069
7070 de = dictFind(c->db->blockingkeys,key);
7071 if (de == NULL) return 0;
7072 l = dictGetEntryVal(de);
7073 ln = listFirst(l);
7074 assert(ln != NULL);
7075 receiver = ln->value;
7076
7077 addReplySds(receiver,sdsnew("*2\r\n"));
7078 addReplyBulk(receiver,key);
7079 addReplyBulk(receiver,ele);
7080 unblockClientWaitingData(receiver);
7081 return 1;
7082 }
7083
7084 /* Blocking RPOP/LPOP */
7085 static void blockingPopGenericCommand(redisClient *c, int where) {
7086 robj *o;
7087 time_t timeout;
7088 int j;
7089
7090 for (j = 1; j < c->argc-1; j++) {
7091 o = lookupKeyWrite(c->db,c->argv[j]);
7092 if (o != NULL) {
7093 if (o->type != REDIS_LIST) {
7094 addReply(c,shared.wrongtypeerr);
7095 return;
7096 } else {
7097 list *list = o->ptr;
7098 if (listLength(list) != 0) {
7099 /* If the list contains elements fall back to the usual
7100 * non-blocking POP operation */
7101 robj *argv[2], **orig_argv;
7102 int orig_argc;
7103
7104 /* We need to alter the command arguments before to call
7105 * popGenericCommand() as the command takes a single key. */
7106 orig_argv = c->argv;
7107 orig_argc = c->argc;
7108 argv[1] = c->argv[j];
7109 c->argv = argv;
7110 c->argc = 2;
7111
7112 /* Also the return value is different, we need to output
7113 * the multi bulk reply header and the key name. The
7114 * "real" command will add the last element (the value)
7115 * for us. If this souds like an hack to you it's just
7116 * because it is... */
7117 addReplySds(c,sdsnew("*2\r\n"));
7118 addReplyBulk(c,argv[1]);
7119 popGenericCommand(c,where);
7120
7121 /* Fix the client structure with the original stuff */
7122 c->argv = orig_argv;
7123 c->argc = orig_argc;
7124 return;
7125 }
7126 }
7127 }
7128 }
7129 /* If the list is empty or the key does not exists we must block */
7130 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
7131 if (timeout > 0) timeout += time(NULL);
7132 blockForKeys(c,c->argv+1,c->argc-2,timeout);
7133 }
7134
7135 static void blpopCommand(redisClient *c) {
7136 blockingPopGenericCommand(c,REDIS_HEAD);
7137 }
7138
7139 static void brpopCommand(redisClient *c) {
7140 blockingPopGenericCommand(c,REDIS_TAIL);
7141 }
7142
7143 /* =============================== Replication ============================= */
7144
7145 static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
7146 ssize_t nwritten, ret = size;
7147 time_t start = time(NULL);
7148
7149 timeout++;
7150 while(size) {
7151 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7152 nwritten = write(fd,ptr,size);
7153 if (nwritten == -1) return -1;
7154 ptr += nwritten;
7155 size -= nwritten;
7156 }
7157 if ((time(NULL)-start) > timeout) {
7158 errno = ETIMEDOUT;
7159 return -1;
7160 }
7161 }
7162 return ret;
7163 }
7164
7165 static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
7166 ssize_t nread, totread = 0;
7167 time_t start = time(NULL);
7168
7169 timeout++;
7170 while(size) {
7171 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7172 nread = read(fd,ptr,size);
7173 if (nread == -1) return -1;
7174 ptr += nread;
7175 size -= nread;
7176 totread += nread;
7177 }
7178 if ((time(NULL)-start) > timeout) {
7179 errno = ETIMEDOUT;
7180 return -1;
7181 }
7182 }
7183 return totread;
7184 }
7185
/* Synchronously read a line from 'fd' into the 'size' bytes buffer 'ptr',
 * one byte at a time through syncRead() (each byte gets its own 'timeout').
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * before it is stripped as well. The buffer is always null terminated when
 * 'size' is at least 1, and at most size-1 characters are stored: a longer
 * line is truncated and the rest is left unread on the descriptor.
 *
 * Returns the number of characters stored before the terminator was
 * found (a stripped '\r' is still counted), 0 if 'size' is not positive,
 * or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* reserve room for the null terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the stored byte: without this the loop never stops
         * on an overlong line and writes past the end of the buffer. */
        size--;
    }
    return nread;
}
7206
7207 static void syncCommand(redisClient *c) {
7208 /* ignore SYNC if aleady slave or in monitor mode */
7209 if (c->flags & REDIS_SLAVE) return;
7210
7211 /* SYNC can't be issued when the server has pending data to send to
7212 * the client about already issued commands. We need a fresh reply
7213 * buffer registering the differences between the BGSAVE and the current
7214 * dataset, so that we can copy to other slaves if needed. */
7215 if (listLength(c->reply) != 0) {
7216 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7217 return;
7218 }
7219
7220 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7221 /* Here we need to check if there is a background saving operation
7222 * in progress, or if it is required to start one */
7223 if (server.bgsavechildpid != -1) {
7224 /* Ok a background save is in progress. Let's check if it is a good
7225 * one for replication, i.e. if there is another slave that is
7226 * registering differences since the server forked to save */
7227 redisClient *slave;
7228 listNode *ln;
7229 listIter li;
7230
7231 listRewind(server.slaves,&li);
7232 while((ln = listNext(&li))) {
7233 slave = ln->value;
7234 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
7235 }
7236 if (ln) {
7237 /* Perfect, the server is already registering differences for
7238 * another slave. Set the right state, and copy the buffer. */
7239 listRelease(c->reply);
7240 c->reply = listDup(slave->reply);
7241 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7242 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7243 } else {
7244 /* No way, we need to wait for the next BGSAVE in order to
7245 * register differences */
7246 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7247 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7248 }
7249 } else {
7250 /* Ok we don't have a BGSAVE in progress, let's start one */
7251 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7252 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7253 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7254 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7255 return;
7256 }
7257 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7258 }
7259 c->repldbfd = -1;
7260 c->flags |= REDIS_SLAVE;
7261 c->slaveseldb = 0;
7262 listAddNodeTail(server.slaves,c);
7263 return;
7264 }
7265
7266 static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7267 redisClient *slave = privdata;
7268 REDIS_NOTUSED(el);
7269 REDIS_NOTUSED(mask);
7270 char buf[REDIS_IOBUF_LEN];
7271 ssize_t nwritten, buflen;
7272
7273 if (slave->repldboff == 0) {
7274 /* Write the bulk write count before to transfer the DB. In theory here
7275 * we don't know how much room there is in the output buffer of the
7276 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7277 * operations) will never be smaller than the few bytes we need. */
7278 sds bulkcount;
7279
7280 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7281 slave->repldbsize);
7282 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7283 {
7284 sdsfree(bulkcount);
7285 freeClient(slave);
7286 return;
7287 }
7288 sdsfree(bulkcount);
7289 }
7290 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7291 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7292 if (buflen <= 0) {
7293 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7294 (buflen == 0) ? "premature EOF" : strerror(errno));
7295 freeClient(slave);
7296 return;
7297 }
7298 if ((nwritten = write(fd,buf,buflen)) == -1) {
7299 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
7300 strerror(errno));
7301 freeClient(slave);
7302 return;
7303 }
7304 slave->repldboff += nwritten;
7305 if (slave->repldboff == slave->repldbsize) {
7306 close(slave->repldbfd);
7307 slave->repldbfd = -1;
7308 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7309 slave->replstate = REDIS_REPL_ONLINE;
7310 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
7311 sendReplyToClient, slave) == AE_ERR) {
7312 freeClient(slave);
7313 return;
7314 }
7315 addReplySds(slave,sdsempty());
7316 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7317 }
7318 }
7319
7320 /* This function is called at the end of every backgrond saving.
7321 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7322 * otherwise REDIS_ERR is passed to the function.
7323 *
7324 * The goal of this function is to handle slaves waiting for a successful
7325 * background saving in order to perform non-blocking synchronization. */
7326 static void updateSlavesWaitingBgsave(int bgsaveerr) {
7327 listNode *ln;
7328 int startbgsave = 0;
7329 listIter li;
7330
7331 listRewind(server.slaves,&li);
7332 while((ln = listNext(&li))) {
7333 redisClient *slave = ln->value;
7334
7335 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7336 startbgsave = 1;
7337 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7338 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7339 struct redis_stat buf;
7340
7341 if (bgsaveerr != REDIS_OK) {
7342 freeClient(slave);
7343 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7344 continue;
7345 }
7346 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7347 redis_fstat(slave->repldbfd,&buf) == -1) {
7348 freeClient(slave);
7349 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7350 continue;
7351 }
7352 slave->repldboff = 0;
7353 slave->repldbsize = buf.st_size;
7354 slave->replstate = REDIS_REPL_SEND_BULK;
7355 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7356 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7357 freeClient(slave);
7358 continue;
7359 }
7360 }
7361 }
7362 if (startbgsave) {
7363 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7364 listIter li;
7365
7366 listRewind(server.slaves,&li);
7367 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
7368 while((ln = listNext(&li))) {
7369 redisClient *slave = ln->value;
7370
7371 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7372 freeClient(slave);
7373 }
7374 }
7375 }
7376 }
7377
7378 static int syncWithMaster(void) {
7379 char buf[1024], tmpfile[256], authcmd[1024];
7380 long dumpsize;
7381 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7382 int dfd, maxtries = 5;
7383
7384 if (fd == -1) {
7385 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7386 strerror(errno));
7387 return REDIS_ERR;
7388 }
7389
7390 /* AUTH with the master if required. */
7391 if(server.masterauth) {
7392 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7393 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7394 close(fd);
7395 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7396 strerror(errno));
7397 return REDIS_ERR;
7398 }
7399 /* Read the AUTH result. */
7400 if (syncReadLine(fd,buf,1024,3600) == -1) {
7401 close(fd);
7402 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7403 strerror(errno));
7404 return REDIS_ERR;
7405 }
7406 if (buf[0] != '+') {
7407 close(fd);
7408 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7409 return REDIS_ERR;
7410 }
7411 }
7412
7413 /* Issue the SYNC command */
7414 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7415 close(fd);
7416 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7417 strerror(errno));
7418 return REDIS_ERR;
7419 }
7420 /* Read the bulk write count */
7421 if (syncReadLine(fd,buf,1024,3600) == -1) {
7422 close(fd);
7423 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7424 strerror(errno));
7425 return REDIS_ERR;
7426 }
7427 if (buf[0] != '$') {
7428 close(fd);
7429 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7430 return REDIS_ERR;
7431 }
7432 dumpsize = strtol(buf+1,NULL,10);
7433 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
7434 /* Read the bulk write data on a temp file */
7435 while(maxtries--) {
7436 snprintf(tmpfile,256,
7437 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7438 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7439 if (dfd != -1) break;
7440 sleep(1);
7441 }
7442 if (dfd == -1) {
7443 close(fd);
7444 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7445 return REDIS_ERR;
7446 }
7447 while(dumpsize) {
7448 int nread, nwritten;
7449
7450 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7451 if (nread == -1) {
7452 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7453 strerror(errno));
7454 close(fd);
7455 close(dfd);
7456 return REDIS_ERR;
7457 }
7458 nwritten = write(dfd,buf,nread);
7459 if (nwritten == -1) {
7460 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7461 close(fd);
7462 close(dfd);
7463 return REDIS_ERR;
7464 }
7465 dumpsize -= nread;
7466 }
7467 close(dfd);
7468 if (rename(tmpfile,server.dbfilename) == -1) {
7469 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7470 unlink(tmpfile);
7471 close(fd);
7472 return REDIS_ERR;
7473 }
7474 emptyDb();
7475 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7476 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7477 close(fd);
7478 return REDIS_ERR;
7479 }
7480 server.master = createClient(fd);
7481 server.master->flags |= REDIS_MASTER;
7482 server.master->authenticated = 1;
7483 server.replstate = REDIS_REPL_CONNECTED;
7484 return REDIS_OK;
7485 }
7486
7487 static void slaveofCommand(redisClient *c) {
7488 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7489 !strcasecmp(c->argv[2]->ptr,"one")) {
7490 if (server.masterhost) {
7491 sdsfree(server.masterhost);
7492 server.masterhost = NULL;
7493 if (server.master) freeClient(server.master);
7494 server.replstate = REDIS_REPL_NONE;
7495 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7496 }
7497 } else {
7498 sdsfree(server.masterhost);
7499 server.masterhost = sdsdup(c->argv[1]->ptr);
7500 server.masterport = atoi(c->argv[2]->ptr);
7501 if (server.master) freeClient(server.master);
7502 server.replstate = REDIS_REPL_CONNECT;
7503 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7504 server.masterhost, server.masterport);
7505 }
7506 addReply(c,shared.ok);
7507 }
7508
7509 /* ============================ Maxmemory directive ======================== */
7510
7511 /* Try to free one object form the pre-allocated objects free list.
7512 * This is useful under low mem conditions as by default we take 1 million
7513 * free objects allocated. On success REDIS_OK is returned, otherwise
7514 * REDIS_ERR. */
7515 static int tryFreeOneObjectFromFreelist(void) {
7516 robj *o;
7517
7518 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7519 if (listLength(server.objfreelist)) {
7520 listNode *head = listFirst(server.objfreelist);
7521 o = listNodeValue(head);
7522 listDelNode(server.objfreelist,head);
7523 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7524 zfree(o);
7525 return REDIS_OK;
7526 } else {
7527 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7528 return REDIS_ERR;
7529 }
7530 }
7531
7532 /* This function gets called when 'maxmemory' is set on the config file to limit
7533 * the max memory used by the server, and we are out of memory.
7534 * This function will try to, in order:
7535 *
7536 * - Free objects from the free list
7537 * - Try to remove keys with an EXPIRE set
7538 *
7539 * It is not possible to free enough memory to reach used-memory < maxmemory
7540 * the server will start refusing commands that will enlarge even more the
7541 * memory usage.
7542 */
7543 static void freeMemoryIfNeeded(void) {
7544 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
7545 int j, k, freed = 0;
7546
7547 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7548 for (j = 0; j < server.dbnum; j++) {
7549 int minttl = -1;
7550 robj *minkey = NULL;
7551 struct dictEntry *de;
7552
7553 if (dictSize(server.db[j].expires)) {
7554 freed = 1;
7555 /* From a sample of three keys drop the one nearest to
7556 * the natural expire */
7557 for (k = 0; k < 3; k++) {
7558 time_t t;
7559
7560 de = dictGetRandomKey(server.db[j].expires);
7561 t = (time_t) dictGetEntryVal(de);
7562 if (minttl == -1 || t < minttl) {
7563 minkey = dictGetEntryKey(de);
7564 minttl = t;
7565 }
7566 }
7567 deleteKey(server.db+j,minkey);
7568 }
7569 }
7570 if (!freed) return; /* nothing to free... */
7571 }
7572 }
7573
7574 /* ============================== Append Only file ========================== */
7575
7576 static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7577 sds buf = sdsempty();
7578 int j;
7579 ssize_t nwritten;
7580 time_t now;
7581 robj *tmpargv[3];
7582
7583 /* The DB this command was targetting is not the same as the last command
7584 * we appendend. To issue a SELECT command is needed. */
7585 if (dictid != server.appendseldb) {
7586 char seldb[64];
7587
7588 snprintf(seldb,sizeof(seldb),"%d",dictid);
7589 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
7590 (unsigned long)strlen(seldb),seldb);
7591 server.appendseldb = dictid;
7592 }
7593
7594 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7595 * EXPIREs into EXPIREATs calls */
7596 if (cmd->proc == expireCommand) {
7597 long when;
7598
7599 tmpargv[0] = createStringObject("EXPIREAT",8);
7600 tmpargv[1] = argv[1];
7601 incrRefCount(argv[1]);
7602 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7603 tmpargv[2] = createObject(REDIS_STRING,
7604 sdscatprintf(sdsempty(),"%ld",when));
7605 argv = tmpargv;
7606 }
7607
7608 /* Append the actual command */
7609 buf = sdscatprintf(buf,"*%d\r\n",argc);
7610 for (j = 0; j < argc; j++) {
7611 robj *o = argv[j];
7612
7613 o = getDecodedObject(o);
7614 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
7615 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7616 buf = sdscatlen(buf,"\r\n",2);
7617 decrRefCount(o);
7618 }
7619
7620 /* Free the objects from the modified argv for EXPIREAT */
7621 if (cmd->proc == expireCommand) {
7622 for (j = 0; j < 3; j++)
7623 decrRefCount(argv[j]);
7624 }
7625
7626 /* We want to perform a single write. This should be guaranteed atomic
7627 * at least if the filesystem we are writing is a real physical one.
7628 * While this will save us against the server being killed I don't think
7629 * there is much to do about the whole server stopping for power problems
7630 * or alike */
7631 nwritten = write(server.appendfd,buf,sdslen(buf));
7632 if (nwritten != (signed)sdslen(buf)) {
7633 /* Ooops, we are in troubles. The best thing to do for now is
7634 * to simply exit instead to give the illusion that everything is
7635 * working as expected. */
7636 if (nwritten == -1) {
7637 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7638 } else {
7639 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7640 }
7641 exit(1);
7642 }
7643 /* If a background append only file rewriting is in progress we want to
7644 * accumulate the differences between the child DB and the current one
7645 * in a buffer, so that when the child process will do its work we
7646 * can append the differences to the new append only file. */
7647 if (server.bgrewritechildpid != -1)
7648 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7649
7650 sdsfree(buf);
7651 now = time(NULL);
7652 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7653 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7654 now-server.lastfsync > 1))
7655 {
7656 fsync(server.appendfd); /* Let's try to get this data on the disk */
7657 server.lastfsync = now;
7658 }
7659 }
7660
7661 /* In Redis commands are always executed in the context of a client, so in
7662 * order to load the append only file we need to create a fake client. */
7663 static struct redisClient *createFakeClient(void) {
7664 struct redisClient *c = zmalloc(sizeof(*c));
7665
7666 selectDb(c,0);
7667 c->fd = -1;
7668 c->querybuf = sdsempty();
7669 c->argc = 0;
7670 c->argv = NULL;
7671 c->flags = 0;
7672 /* We set the fake client as a slave waiting for the synchronization
7673 * so that Redis will not try to send replies to this client. */
7674 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7675 c->reply = listCreate();
7676 listSetFreeMethod(c->reply,decrRefCount);
7677 listSetDupMethod(c->reply,dupClientReplyValue);
7678 return c;
7679 }
7680
7681 static void freeFakeClient(struct redisClient *c) {
7682 sdsfree(c->querybuf);
7683 listRelease(c->reply);
7684 zfree(c);
7685 }
7686
/* Replay the append log file. On success REDIS_OK is returned. On non fatal
 * error (the append only file is zero-length) REDIS_ERR is returned. On
 * fatal error an error message is logged and the program exits. */
7690 int loadAppendOnlyFile(char *filename) {
7691 struct redisClient *fakeClient;
7692 FILE *fp = fopen(filename,"r");
7693 struct redis_stat sb;
7694 unsigned long long loadedkeys = 0;
7695
7696 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7697 return REDIS_ERR;
7698
7699 if (fp == NULL) {
7700 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7701 exit(1);
7702 }
7703
7704 fakeClient = createFakeClient();
7705 while(1) {
7706 int argc, j;
7707 unsigned long len;
7708 robj **argv;
7709 char buf[128];
7710 sds argsds;
7711 struct redisCommand *cmd;
7712
7713 if (fgets(buf,sizeof(buf),fp) == NULL) {
7714 if (feof(fp))
7715 break;
7716 else
7717 goto readerr;
7718 }
7719 if (buf[0] != '*') goto fmterr;
7720 argc = atoi(buf+1);
7721 argv = zmalloc(sizeof(robj*)*argc);
7722 for (j = 0; j < argc; j++) {
7723 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7724 if (buf[0] != '$') goto fmterr;
7725 len = strtol(buf+1,NULL,10);
7726 argsds = sdsnewlen(NULL,len);
7727 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7728 argv[j] = createObject(REDIS_STRING,argsds);
7729 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7730 }
7731
7732 /* Command lookup */
7733 cmd = lookupCommand(argv[0]->ptr);
7734 if (!cmd) {
7735 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7736 exit(1);
7737 }
7738 /* Try object sharing and encoding */
7739 if (server.shareobjects) {
7740 int j;
7741 for(j = 1; j < argc; j++)
7742 argv[j] = tryObjectSharing(argv[j]);
7743 }
7744 if (cmd->flags & REDIS_CMD_BULK)
7745 tryObjectEncoding(argv[argc-1]);
7746 /* Run the command in the context of a fake client */
7747 fakeClient->argc = argc;
7748 fakeClient->argv = argv;
7749 cmd->proc(fakeClient);
7750 /* Discard the reply objects list from the fake client */
7751 while(listLength(fakeClient->reply))
7752 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7753 /* Clean up, ready for the next command */
7754 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7755 zfree(argv);
7756 /* Handle swapping while loading big datasets when VM is on */
7757 loadedkeys++;
7758 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7759 while (zmalloc_used_memory() > server.vm_max_memory) {
7760 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7761 }
7762 }
7763 }
7764 fclose(fp);
7765 freeFakeClient(fakeClient);
7766 return REDIS_OK;
7767
7768 readerr:
7769 if (feof(fp)) {
7770 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7771 } else {
7772 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7773 }
7774 exit(1);
7775 fmterr:
7776 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7777 exit(1);
7778 }
7779
7780 /* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
7781 static int fwriteBulkObject(FILE *fp, robj *obj) {
7782 char buf[128];
7783 int decrrc = 0;
7784
7785 /* Avoid the incr/decr ref count business if possible to help
7786 * copy-on-write (we are often in a child process when this function
7787 * is called).
7788 * Also makes sure that key objects don't get incrRefCount-ed when VM
7789 * is enabled */
7790 if (obj->encoding != REDIS_ENCODING_RAW) {
7791 obj = getDecodedObject(obj);
7792 decrrc = 1;
7793 }
7794 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7795 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
7796 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7797 goto err;
7798 if (fwrite("\r\n",2,1,fp) == 0) goto err;
7799 if (decrrc) decrRefCount(obj);
7800 return 1;
7801 err:
7802 if (decrrc) decrRefCount(obj);
7803 return 0;
7804 }
7805
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n.
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* A zero-length payload is legal: only the header and CRLF are written */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7817
/* Emit the double 'd', printed with the "%.17g" format, on 'fp' in the
 * bulk format $<count>\r\n<payload>\r\n.
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    unsigned long digits;

    /* The payload buffer already carries the trailing CRLF, that must not
     * be counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,strlen(payload),1,fp) == 0) return 0;
    return 1;
}
7828
/* Emit the long 'l', in its decimal representation, on 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n.
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    unsigned long digits;

    /* The payload buffer already carries the trailing CRLF, that must not
     * be counted in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);
    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,strlen(payload),1,fp) == 0) return 0;
    return 1;
}
7839
7840 /* Write a sequence of commands able to fully rebuild the dataset into
7841 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7842 static int rewriteAppendOnlyFile(char *filename) {
7843 dictIterator *di = NULL;
7844 dictEntry *de;
7845 FILE *fp;
7846 char tmpfile[256];
7847 int j;
7848 time_t now = time(NULL);
7849
7850 /* Note that we have to use a different temp name here compared to the
7851 * one used by rewriteAppendOnlyFileBackground() function. */
7852 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7853 fp = fopen(tmpfile,"w");
7854 if (!fp) {
7855 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7856 return REDIS_ERR;
7857 }
7858 for (j = 0; j < server.dbnum; j++) {
7859 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7860 redisDb *db = server.db+j;
7861 dict *d = db->dict;
7862 if (dictSize(d) == 0) continue;
7863 di = dictGetIterator(d);
7864 if (!di) {
7865 fclose(fp);
7866 return REDIS_ERR;
7867 }
7868
7869 /* SELECT the new DB */
7870 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
7871 if (fwriteBulkLong(fp,j) == 0) goto werr;
7872
7873 /* Iterate this DB writing every entry */
7874 while((de = dictNext(di)) != NULL) {
7875 robj *key, *o;
7876 time_t expiretime;
7877 int swapped;
7878
7879 key = dictGetEntryKey(de);
7880 /* If the value for this key is swapped, load a preview in memory.
7881 * We use a "swapped" flag to remember if we need to free the
7882 * value object instead to just increment the ref count anyway
7883 * in order to avoid copy-on-write of pages if we are forked() */
7884 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7885 key->storage == REDIS_VM_SWAPPING) {
7886 o = dictGetEntryVal(de);
7887 swapped = 0;
7888 } else {
7889 o = vmPreviewObject(key);
7890 swapped = 1;
7891 }
7892 expiretime = getExpire(db,key);
7893
7894 /* Save the key and associated value */
7895 if (o->type == REDIS_STRING) {
7896 /* Emit a SET command */
7897 char cmd[]="*3\r\n$3\r\nSET\r\n";
7898 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7899 /* Key and value */
7900 if (fwriteBulkObject(fp,key) == 0) goto werr;
7901 if (fwriteBulkObject(fp,o) == 0) goto werr;
7902 } else if (o->type == REDIS_LIST) {
7903 /* Emit the RPUSHes needed to rebuild the list */
7904 list *list = o->ptr;
7905 listNode *ln;
7906 listIter li;
7907
7908 listRewind(list,&li);
7909 while((ln = listNext(&li))) {
7910 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7911 robj *eleobj = listNodeValue(ln);
7912
7913 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7914 if (fwriteBulkObject(fp,key) == 0) goto werr;
7915 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7916 }
7917 } else if (o->type == REDIS_SET) {
7918 /* Emit the SADDs needed to rebuild the set */
7919 dict *set = o->ptr;
7920 dictIterator *di = dictGetIterator(set);
7921 dictEntry *de;
7922
7923 while((de = dictNext(di)) != NULL) {
7924 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7925 robj *eleobj = dictGetEntryKey(de);
7926
7927 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7928 if (fwriteBulkObject(fp,key) == 0) goto werr;
7929 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7930 }
7931 dictReleaseIterator(di);
7932 } else if (o->type == REDIS_ZSET) {
7933 /* Emit the ZADDs needed to rebuild the sorted set */
7934 zset *zs = o->ptr;
7935 dictIterator *di = dictGetIterator(zs->dict);
7936 dictEntry *de;
7937
7938 while((de = dictNext(di)) != NULL) {
7939 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7940 robj *eleobj = dictGetEntryKey(de);
7941 double *score = dictGetEntryVal(de);
7942
7943 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7944 if (fwriteBulkObject(fp,key) == 0) goto werr;
7945 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
7946 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
7947 }
7948 dictReleaseIterator(di);
7949 } else if (o->type == REDIS_HASH) {
7950 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7951
7952 /* Emit the HSETs needed to rebuild the hash */
7953 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7954 unsigned char *p = zipmapRewind(o->ptr);
7955 unsigned char *field, *val;
7956 unsigned int flen, vlen;
7957
7958 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7959 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7960 if (fwriteBulkObject(fp,key) == 0) goto werr;
7961 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7962 return -1;
7963 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7964 return -1;
7965 }
7966 } else {
7967 dictIterator *di = dictGetIterator(o->ptr);
7968 dictEntry *de;
7969
7970 while((de = dictNext(di)) != NULL) {
7971 robj *field = dictGetEntryKey(de);
7972 robj *val = dictGetEntryVal(de);
7973
7974 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7975 if (fwriteBulkObject(fp,key) == 0) goto werr;
7976 if (fwriteBulkObject(fp,field) == -1) return -1;
7977 if (fwriteBulkObject(fp,val) == -1) return -1;
7978 }
7979 dictReleaseIterator(di);
7980 }
7981 } else {
7982 redisAssert(0);
7983 }
7984 /* Save the expire time */
7985 if (expiretime != -1) {
7986 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
7987 /* If this key is already expired skip it */
7988 if (expiretime < now) continue;
7989 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7990 if (fwriteBulkObject(fp,key) == 0) goto werr;
7991 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7992 }
7993 if (swapped) decrRefCount(o);
7994 }
7995 dictReleaseIterator(di);
7996 }
7997
7998 /* Make sure data will not remain on the OS's output buffers */
7999 fflush(fp);
8000 fsync(fileno(fp));
8001 fclose(fp);
8002
8003 /* Use RENAME to make sure the DB file is changed atomically only
8004 * if the generate DB file is ok. */
8005 if (rename(tmpfile,filename) == -1) {
8006 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8007 unlink(tmpfile);
8008 return REDIS_ERR;
8009 }
8010 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8011 return REDIS_OK;
8012
8013 werr:
8014 fclose(fp);
8015 unlink(tmpfile);
8016 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8017 if (di) dictReleaseIterator(di);
8018 return REDIS_ERR;
8019 }
8020
8021 /* This is how rewriting of the append only file in background works:
8022 *
8023 * 1) The user calls BGREWRITEAOF
8024 * 2) Redis calls this function, that forks():
8025 * 2a) the child rewrite the append only file in a temp file.
8026 * 2b) the parent accumulates differences in server.bgrewritebuf.
8027 * 3) When the child finished '2a' exists.
8028 * 4) The parent will trap the exit code, if it's OK, will append the
8029 * data accumulated into server.bgrewritebuf into the temp file, and
8030 * finally will rename(2) the temp file in the actual file name.
8031 * The the new file is reopened as the new append only file. Profit!
8032 */
8033 static int rewriteAppendOnlyFileBackground(void) {
8034 pid_t childpid;
8035
8036 if (server.bgrewritechildpid != -1) return REDIS_ERR;
8037 if (server.vm_enabled) waitEmptyIOJobsQueue();
8038 if ((childpid = fork()) == 0) {
8039 /* Child */
8040 char tmpfile[256];
8041
8042 if (server.vm_enabled) vmReopenSwapFile();
8043 close(server.fd);
8044 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8045 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8046 _exit(0);
8047 } else {
8048 _exit(1);
8049 }
8050 } else {
8051 /* Parent */
8052 if (childpid == -1) {
8053 redisLog(REDIS_WARNING,
8054 "Can't rewrite append only file in background: fork: %s",
8055 strerror(errno));
8056 return REDIS_ERR;
8057 }
8058 redisLog(REDIS_NOTICE,
8059 "Background append only file rewriting started by pid %d",childpid);
8060 server.bgrewritechildpid = childpid;
8061 /* We set appendseldb to -1 in order to force the next call to the
8062 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8063 * accumulated by the parent into server.bgrewritebuf will start
8064 * with a SELECT statement and it will be safe to merge. */
8065 server.appendseldb = -1;
8066 return REDIS_OK;
8067 }
8068 return REDIS_OK; /* unreached */
8069 }
8070
8071 static void bgrewriteaofCommand(redisClient *c) {
8072 if (server.bgrewritechildpid != -1) {
8073 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8074 return;
8075 }
8076 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
8077 char *status = "+Background append only file rewriting started\r\n";
8078 addReplySds(c,sdsnew(status));
8079 } else {
8080 addReply(c,shared.err);
8081 }
8082 }
8083
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" associated with the
 * process 'childpid'. The result of unlink() is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8090
8091 /* Virtual Memory is composed mainly of two subsystems:
8092 * - Blocking Virtual Memory
8093 * - Threaded Virtual Memory I/O
8094 * The two parts are not fully decoupled, but functions are split among two
8095 * different sections of the source code (delimited by comments) in order to
8096 * make more clear what functionality is about the blocking VM and what about
8097 * the threaded (not blocking) VM.
8098 *
8099 * Redis VM design:
8100 *
8101 * Redis VM is a blocking VM (one that blocks reading swapped values from
8102 * disk into memory when a value swapped out is needed in memory) that is made
8103 * unblocking by trying to examine the command argument vector in order to
8104 * load in background values that will likely be needed in order to exec
8105 * the command. The command is executed only once all the relevant keys
8106 * are loaded into memory.
8107 *
8108 * This is basically almost as simple as a blocking VM, but almost as
8109 * parallel as a fully non-blocking VM.
8110 */
8111
8112 /* =================== Virtual Memory - Blocking Side ====================== */
8113
8114 /* substitute the first occurrence of '%p' with the process pid in the
8115 * swap file name. */
8116 static void expandVmSwapFilename(void) {
8117 char *p = strstr(server.vm_swap_file,"%p");
8118 sds new;
8119
8120 if (!p) return;
8121 new = sdsempty();
8122 *p = '\0';
8123 new = sdscat(new,server.vm_swap_file);
8124 new = sdscatprintf(new,"%ld",(long) getpid());
8125 new = sdscat(new,p+2);
8126 zfree(server.vm_swap_file);
8127 server.vm_swap_file = new;
8128 }
8129
8130 static void vmInit(void) {
8131 off_t totsize;
8132 int pipefds[2];
8133 size_t stacksize;
8134
8135 if (server.vm_max_threads != 0)
8136 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8137
8138 expandVmSwapFilename();
8139 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8140 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8141 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8142 }
8143 if (server.vm_fp == NULL) {
8144 redisLog(REDIS_WARNING,
8145 "Impossible to open the swap file: %s. Exiting.",
8146 strerror(errno));
8147 exit(1);
8148 }
8149 server.vm_fd = fileno(server.vm_fp);
8150 server.vm_next_page = 0;
8151 server.vm_near_pages = 0;
8152 server.vm_stats_used_pages = 0;
8153 server.vm_stats_swapped_objects = 0;
8154 server.vm_stats_swapouts = 0;
8155 server.vm_stats_swapins = 0;
8156 totsize = server.vm_pages*server.vm_page_size;
8157 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8158 if (ftruncate(server.vm_fd,totsize) == -1) {
8159 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8160 strerror(errno));
8161 exit(1);
8162 } else {
8163 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8164 }
8165 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8166 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
8167 (long long) (server.vm_pages+7)/8, server.vm_pages);
8168 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8169
8170 /* Initialize threaded I/O (used by Virtual Memory) */
8171 server.io_newjobs = listCreate();
8172 server.io_processing = listCreate();
8173 server.io_processed = listCreate();
8174 server.io_ready_clients = listCreate();
8175 pthread_mutex_init(&server.io_mutex,NULL);
8176 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8177 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8178 server.io_active_threads = 0;
8179 if (pipe(pipefds) == -1) {
8180 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8181 ,strerror(errno));
8182 exit(1);
8183 }
8184 server.io_ready_pipe_read = pipefds[0];
8185 server.io_ready_pipe_write = pipefds[1];
8186 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8187 /* LZF requires a lot of stack */
8188 pthread_attr_init(&server.io_threads_attr);
8189 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8190 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8191 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8192 /* Listen for events in the threaded I/O pipe */
8193 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8194 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8195 oom("creating file event");
8196 }
8197
8198 /* Mark the page as used */
8199 static void vmMarkPageUsed(off_t page) {
8200 off_t byte = page/8;
8201 int bit = page&7;
8202 redisAssert(vmFreePage(page) == 1);
8203 server.vm_bitmap[byte] |= 1<<bit;
8204 }
8205
8206 /* Mark N contiguous pages as used, with 'page' being the first. */
8207 static void vmMarkPagesUsed(off_t page, off_t count) {
8208 off_t j;
8209
8210 for (j = 0; j < count; j++)
8211 vmMarkPageUsed(page+j);
8212 server.vm_stats_used_pages += count;
8213 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8214 (long long)count, (long long)page);
8215 }
8216
8217 /* Mark the page as free */
8218 static void vmMarkPageFree(off_t page) {
8219 off_t byte = page/8;
8220 int bit = page&7;
8221 redisAssert(vmFreePage(page) == 0);
8222 server.vm_bitmap[byte] &= ~(1<<bit);
8223 }
8224
8225 /* Mark N contiguous pages as free, with 'page' being the first. */
8226 static void vmMarkPagesFree(off_t page, off_t count) {
8227 off_t j;
8228
8229 for (j = 0; j < count; j++)
8230 vmMarkPageFree(page+j);
8231 server.vm_stats_used_pages -= count;
8232 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8233 (long long)count, (long long)page);
8234 }
8235
8236 /* Test if the page is free */
8237 static int vmFreePage(off_t page) {
8238 off_t byte = page/8;
8239 int bit = page&7;
8240 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
8241 }
8242
8243 /* Find N contiguous free pages storing the first page of the cluster in *first.
8244 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8245 * REDIS_ERR is returned.
8246 *
8247 * This function uses a simple algorithm: we try to allocate
8248 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8249 * again from the start of the swap file searching for free spaces.
8250 *
8251 * If it looks pretty clear that there are no free pages near our offset
8252 * we try to find less populated places doing a forward jump of
8253 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8254 * without hurry, and then we jump again and so forth...
8255 *
8256 * This function can be improved using a free list to avoid to guess
8257 * too much, since we could collect data about freed pages.
8258 *
8259 * note: I implemented this function just after watching an episode of
8260 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8261 */
8262 static int vmFindContiguousPages(off_t *first, off_t n) {
8263 off_t base, offset = 0, since_jump = 0, numfree = 0;
8264
8265 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8266 server.vm_near_pages = 0;
8267 server.vm_next_page = 0;
8268 }
8269 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8270 base = server.vm_next_page;
8271
8272 while(offset < server.vm_pages) {
8273 off_t this = base+offset;
8274
8275 /* If we overflow, restart from page zero */
8276 if (this >= server.vm_pages) {
8277 this -= server.vm_pages;
8278 if (this == 0) {
8279 /* Just overflowed, what we found on tail is no longer
8280 * interesting, as it's no longer contiguous. */
8281 numfree = 0;
8282 }
8283 }
8284 if (vmFreePage(this)) {
8285 /* This is a free page */
8286 numfree++;
8287 /* Already got N free pages? Return to the caller, with success */
8288 if (numfree == n) {
8289 *first = this-(n-1);
8290 server.vm_next_page = this+1;
8291 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
8292 return REDIS_OK;
8293 }
8294 } else {
8295 /* The current one is not a free page */
8296 numfree = 0;
8297 }
8298
8299 /* Fast-forward if the current page is not free and we already
8300 * searched enough near this place. */
8301 since_jump++;
8302 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8303 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8304 since_jump = 0;
8305 /* Note that even if we rewind after the jump, we are don't need
8306 * to make sure numfree is set to zero as we only jump *if* it
8307 * is set to zero. */
8308 } else {
8309 /* Otherwise just check the next page */
8310 offset++;
8311 }
8312 }
8313 return REDIS_ERR;
8314 }
8315
8316 /* Write the specified object at the specified page of the swap file */
8317 static int vmWriteObjectOnSwap(robj *o, off_t page) {
8318 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8319 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8320 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8321 redisLog(REDIS_WARNING,
8322 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
8323 strerror(errno));
8324 return REDIS_ERR;
8325 }
8326 rdbSaveObject(server.vm_fp,o);
8327 fflush(server.vm_fp);
8328 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8329 return REDIS_OK;
8330 }
8331
8332 /* Swap the 'val' object relative to 'key' into disk. Store all the information
8333 * needed to later retrieve the object into the key object.
8334 * If we can't find enough contiguous empty pages to swap the object on disk
8335 * REDIS_ERR is returned. */
8336 static int vmSwapObjectBlocking(robj *key, robj *val) {
8337 off_t pages = rdbSavedObjectPages(val,NULL);
8338 off_t page;
8339
8340 assert(key->storage == REDIS_VM_MEMORY);
8341 assert(key->refcount == 1);
8342 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8343 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
8344 key->vm.page = page;
8345 key->vm.usedpages = pages;
8346 key->storage = REDIS_VM_SWAPPED;
8347 key->vtype = val->type;
8348 decrRefCount(val); /* Deallocate the object from memory. */
8349 vmMarkPagesUsed(page,pages);
8350 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8351 (unsigned char*) key->ptr,
8352 (unsigned long long) page, (unsigned long long) pages);
8353 server.vm_stats_swapped_objects++;
8354 server.vm_stats_swapouts++;
8355 return REDIS_OK;
8356 }
8357
8358 static robj *vmReadObjectFromSwap(off_t page, int type) {
8359 robj *o;
8360
8361 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8362 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8363 redisLog(REDIS_WARNING,
8364 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
8365 strerror(errno));
8366 _exit(1);
8367 }
8368 o = rdbLoadObject(type,server.vm_fp);
8369 if (o == NULL) {
8370 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8371 _exit(1);
8372 }
8373 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8374 return o;
8375 }
8376
8377 /* Load the value object relative to the 'key' object from swap to memory.
8378 * The newly allocated object is returned.
8379 *
8380 * If preview is true the unserialized object is returned to the caller but
8381 * no changes are made to the key object, nor the pages are marked as freed */
8382 static robj *vmGenericLoadObject(robj *key, int preview) {
8383 robj *val;
8384
8385 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
8386 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
8387 if (!preview) {
8388 key->storage = REDIS_VM_MEMORY;
8389 key->vm.atime = server.unixtime;
8390 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8391 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8392 (unsigned char*) key->ptr);
8393 server.vm_stats_swapped_objects--;
8394 } else {
8395 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8396 (unsigned char*) key->ptr);
8397 }
8398 server.vm_stats_swapins++;
8399 return val;
8400 }
8401
8402 /* Plain object loading, from swap to memory */
8403 static robj *vmLoadObject(robj *key) {
8404 /* If we are loading the object in background, stop it, we
8405 * need to load this object synchronously ASAP. */
8406 if (key->storage == REDIS_VM_LOADING)
8407 vmCancelThreadedIOJob(key);
8408 return vmGenericLoadObject(key,0);
8409 }
8410
8411 /* Just load the value on disk, without to modify the key.
8412 * This is useful when we want to perform some operation on the value
8413 * without to really bring it from swap to memory, like while saving the
8414 * dataset or rewriting the append only log. */
8415 static robj *vmPreviewObject(robj *key) {
8416 return vmGenericLoadObject(key,1);
8417 }
8418
8419 /* How a good candidate is this object for swapping?
8420 * The better candidate it is, the greater the returned value.
8421 *
8422 * Currently we try to perform a fast estimation of the object size in
8423 * memory, and combine it with aging informations.
8424 *
8425 * Basically swappability = idle-time * log(estimated size)
8426 *
8427 * Bigger objects are preferred over smaller objects, but not
8428 * proportionally, this is why we use the logarithm. This algorithm is
8429 * just a first try and will probably be tuned later. */
8430 static double computeObjectSwappability(robj *o) {
8431 time_t age = server.unixtime - o->vm.atime;
8432 long asize = 0;
8433 list *l;
8434 dict *d;
8435 struct dictEntry *de;
8436 int z;
8437
8438 if (age <= 0) return 0;
8439 switch(o->type) {
8440 case REDIS_STRING:
8441 if (o->encoding != REDIS_ENCODING_RAW) {
8442 asize = sizeof(*o);
8443 } else {
8444 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8445 }
8446 break;
8447 case REDIS_LIST:
8448 l = o->ptr;
8449 listNode *ln = listFirst(l);
8450
8451 asize = sizeof(list);
8452 if (ln) {
8453 robj *ele = ln->value;
8454 long elesize;
8455
8456 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8457 (sizeof(*o)+sdslen(ele->ptr)) :
8458 sizeof(*o);
8459 asize += (sizeof(listNode)+elesize)*listLength(l);
8460 }
8461 break;
8462 case REDIS_SET:
8463 case REDIS_ZSET:
8464 z = (o->type == REDIS_ZSET);
8465 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8466
8467 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8468 if (z) asize += sizeof(zset)-sizeof(dict);
8469 if (dictSize(d)) {
8470 long elesize;
8471 robj *ele;
8472
8473 de = dictGetRandomKey(d);
8474 ele = dictGetEntryKey(de);
8475 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8476 (sizeof(*o)+sdslen(ele->ptr)) :
8477 sizeof(*o);
8478 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8479 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8480 }
8481 break;
8482 case REDIS_HASH:
8483 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8484 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8485 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8486 unsigned int klen, vlen;
8487 unsigned char *key, *val;
8488
8489 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8490 klen = 0;
8491 vlen = 0;
8492 }
8493 asize = len*(klen+vlen+3);
8494 } else if (o->encoding == REDIS_ENCODING_HT) {
8495 d = o->ptr;
8496 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8497 if (dictSize(d)) {
8498 long elesize;
8499 robj *ele;
8500
8501 de = dictGetRandomKey(d);
8502 ele = dictGetEntryKey(de);
8503 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8504 (sizeof(*o)+sdslen(ele->ptr)) :
8505 sizeof(*o);
8506 ele = dictGetEntryVal(de);
8507 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8508 (sizeof(*o)+sdslen(ele->ptr)) :
8509 sizeof(*o);
8510 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8511 }
8512 }
8513 break;
8514 }
8515 return (double)age*log(1+asize);
8516 }
8517
8518 /* Try to swap an object that's a good candidate for swapping.
8519 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8520 * to swap any object at all.
8521 *
8522 * If 'usethreaded' is true, Redis will try to swap the object in background
8523 * using I/O threads. */
8524 static int vmSwapOneObject(int usethreads) {
8525 int j, i;
8526 struct dictEntry *best = NULL;
8527 double best_swappability = 0;
8528 redisDb *best_db = NULL;
8529 robj *key, *val;
8530
8531 for (j = 0; j < server.dbnum; j++) {
8532 redisDb *db = server.db+j;
8533 /* Why maxtries is set to 100?
8534 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8535 * are swappable objects */
8536 int maxtries = 100;
8537
8538 if (dictSize(db->dict) == 0) continue;
8539 for (i = 0; i < 5; i++) {
8540 dictEntry *de;
8541 double swappability;
8542
8543 if (maxtries) maxtries--;
8544 de = dictGetRandomKey(db->dict);
8545 key = dictGetEntryKey(de);
8546 val = dictGetEntryVal(de);
8547 /* Only swap objects that are currently in memory.
8548 *
8549 * Also don't swap shared objects if threaded VM is on, as we
8550 * try to ensure that the main thread does not touch the
8551 * object while the I/O thread is using it, but we can't
8552 * control other keys without adding additional mutex. */
8553 if (key->storage != REDIS_VM_MEMORY ||
8554 (server.vm_max_threads != 0 && val->refcount != 1)) {
8555 if (maxtries) i--; /* don't count this try */
8556 continue;
8557 }
8558 swappability = computeObjectSwappability(val);
8559 if (!best || swappability > best_swappability) {
8560 best = de;
8561 best_swappability = swappability;
8562 best_db = db;
8563 }
8564 }
8565 }
8566 if (best == NULL) return REDIS_ERR;
8567 key = dictGetEntryKey(best);
8568 val = dictGetEntryVal(best);
8569
8570 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
8571 key->ptr, best_swappability);
8572
8573 /* Unshare the key if needed */
8574 if (key->refcount > 1) {
8575 robj *newkey = dupStringObject(key);
8576 decrRefCount(key);
8577 key = dictGetEntryKey(best) = newkey;
8578 }
8579 /* Swap it */
8580 if (usethreads) {
8581 vmSwapObjectThreaded(key,val,best_db);
8582 return REDIS_OK;
8583 } else {
8584 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8585 dictGetEntryVal(best) = NULL;
8586 return REDIS_OK;
8587 } else {
8588 return REDIS_ERR;
8589 }
8590 }
8591 }
8592
/* Swap out one object without using I/O threads.
 * Returns what vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8596
/* Swap out one object using I/O threads.
 * Returns what vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8600
8601 /* Return true if it's safe to swap out objects in a given moment.
8602 * Basically we don't want to swap objects out while there is a BGSAVE
8603 * or a BGAEOREWRITE running in backgroud. */
8604 static int vmCanSwapOut(void) {
8605 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8606 }
8607
8608 /* Delete a key if swapped. Returns 1 if the key was found, was swapped
8609 * and was deleted. Otherwise 0 is returned. */
8610 static int deleteIfSwapped(redisDb *db, robj *key) {
8611 dictEntry *de;
8612 robj *foundkey;
8613
8614 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8615 foundkey = dictGetEntryKey(de);
8616 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8617 deleteKey(db,key);
8618 return 1;
8619 }
8620
8621 /* =================== Virtual Memory - Threaded I/O ======================= */
8622
8623 static void freeIOJob(iojob *j) {
8624 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8625 j->type == REDIS_IOJOB_DO_SWAP ||
8626 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
8627 decrRefCount(j->val);
8628 decrRefCount(j->key);
8629 zfree(j);
8630 }
8631
8632 /* Every time a thread finished a Job, it writes a byte into the write side
8633 * of an unix pipe in order to "awake" the main thread, and this function
8634 * is called. */
8635 static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8636 int mask)
8637 {
8638 char buf[1];
8639 int retval, processed = 0, toprocess = -1, trytoswap = 1;
8640 REDIS_NOTUSED(el);
8641 REDIS_NOTUSED(mask);
8642 REDIS_NOTUSED(privdata);
8643
8644 /* For every byte we read in the read side of the pipe, there is one
8645 * I/O job completed to process. */
8646 while((retval = read(fd,buf,1)) == 1) {
8647 iojob *j;
8648 listNode *ln;
8649 robj *key;
8650 struct dictEntry *de;
8651
8652 redisLog(REDIS_DEBUG,"Processing I/O completed job");
8653
8654 /* Get the processed element (the oldest one) */
8655 lockThreadedIO();
8656 assert(listLength(server.io_processed) != 0);
8657 if (toprocess == -1) {
8658 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8659 if (toprocess <= 0) toprocess = 1;
8660 }
8661 ln = listFirst(server.io_processed);
8662 j = ln->value;
8663 listDelNode(server.io_processed,ln);
8664 unlockThreadedIO();
8665 /* If this job is marked as canceled, just ignore it */
8666 if (j->canceled) {
8667 freeIOJob(j);
8668 continue;
8669 }
8670 /* Post process it in the main thread, as there are things we
8671 * can do just here to avoid race conditions and/or invasive locks */
8672 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
8673 de = dictFind(j->db->dict,j->key);
8674 assert(de != NULL);
8675 key = dictGetEntryKey(de);
8676 if (j->type == REDIS_IOJOB_LOAD) {
8677 redisDb *db;
8678
8679 /* Key loaded, bring it at home */
8680 key->storage = REDIS_VM_MEMORY;
8681 key->vm.atime = server.unixtime;
8682 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8683 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8684 (unsigned char*) key->ptr);
8685 server.vm_stats_swapped_objects--;
8686 server.vm_stats_swapins++;
8687 dictGetEntryVal(de) = j->val;
8688 incrRefCount(j->val);
8689 db = j->db;
8690 freeIOJob(j);
8691 /* Handle clients waiting for this key to be loaded. */
8692 handleClientsBlockedOnSwappedKey(db,key);
8693 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8694 /* Now we know the amount of pages required to swap this object.
8695 * Let's find some space for it, and queue this task again
8696 * rebranded as REDIS_IOJOB_DO_SWAP. */
8697 if (!vmCanSwapOut() ||
8698 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8699 {
8700 /* Ooops... no space or we can't swap as there is
8701 * a fork()ed Redis trying to save stuff on disk. */
8702 freeIOJob(j);
8703 key->storage = REDIS_VM_MEMORY; /* undo operation */
8704 } else {
8705 /* Note that we need to mark this pages as used now,
8706 * if the job will be canceled, we'll mark them as freed
8707 * again. */
8708 vmMarkPagesUsed(j->page,j->pages);
8709 j->type = REDIS_IOJOB_DO_SWAP;
8710 lockThreadedIO();
8711 queueIOJob(j);
8712 unlockThreadedIO();
8713 }
8714 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8715 robj *val;
8716
8717 /* Key swapped. We can finally free some memory. */
8718 if (key->storage != REDIS_VM_SWAPPING) {
8719 printf("key->storage: %d\n",key->storage);
8720 printf("key->name: %s\n",(char*)key->ptr);
8721 printf("key->refcount: %d\n",key->refcount);
8722 printf("val: %p\n",(void*)j->val);
8723 printf("val->type: %d\n",j->val->type);
8724 printf("val->ptr: %s\n",(char*)j->val->ptr);
8725 }
8726 redisAssert(key->storage == REDIS_VM_SWAPPING);
8727 val = dictGetEntryVal(de);
8728 key->vm.page = j->page;
8729 key->vm.usedpages = j->pages;
8730 key->storage = REDIS_VM_SWAPPED;
8731 key->vtype = j->val->type;
8732 decrRefCount(val); /* Deallocate the object from memory. */
8733 dictGetEntryVal(de) = NULL;
8734 redisLog(REDIS_DEBUG,
8735 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8736 (unsigned char*) key->ptr,
8737 (unsigned long long) j->page, (unsigned long long) j->pages);
8738 server.vm_stats_swapped_objects++;
8739 server.vm_stats_swapouts++;
8740 freeIOJob(j);
8741 /* Put a few more swap requests in queue if we are still
8742 * out of memory */
8743 if (trytoswap && vmCanSwapOut() &&
8744 zmalloc_used_memory() > server.vm_max_memory)
8745 {
8746 int more = 1;
8747 while(more) {
8748 lockThreadedIO();
8749 more = listLength(server.io_newjobs) <
8750 (unsigned) server.vm_max_threads;
8751 unlockThreadedIO();
8752 /* Don't waste CPU time if swappable objects are rare. */
8753 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8754 trytoswap = 0;
8755 break;
8756 }
8757 }
8758 }
8759 }
8760 processed++;
8761 if (processed == toprocess) return;
8762 }
8763 if (retval < 0 && errno != EAGAIN) {
8764 redisLog(REDIS_WARNING,
8765 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8766 strerror(errno));
8767 }
8768 }
8769
8770 static void lockThreadedIO(void) {
8771 pthread_mutex_lock(&server.io_mutex);
8772 }
8773
8774 static void unlockThreadedIO(void) {
8775 pthread_mutex_unlock(&server.io_mutex);
8776 }
8777
8778 /* Remove the specified object from the threaded I/O queue if still not
8779 * processed, otherwise make sure to flag it as canceled. */
8780 static void vmCancelThreadedIOJob(robj *o) {
8781 list *lists[3] = {
8782 server.io_newjobs, /* 0 */
8783 server.io_processing, /* 1 */
8784 server.io_processed /* 2 */
8785 };
8786 int i;
8787
8788 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
8789 again:
8790 lockThreadedIO();
8791 /* Search for a matching key in one of the queues */
8792 for (i = 0; i < 3; i++) {
8793 listNode *ln;
8794 listIter li;
8795
8796 listRewind(lists[i],&li);
8797 while ((ln = listNext(&li)) != NULL) {
8798 iojob *job = ln->value;
8799
8800 if (job->canceled) continue; /* Skip this, already canceled. */
8801 if (compareStringObjects(job->key,o) == 0) {
8802 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8803 (void*)job, (char*)o->ptr, job->type, i);
8804 /* Mark the pages as free since the swap didn't happened
8805 * or happened but is now discarded. */
8806 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
8807 vmMarkPagesFree(job->page,job->pages);
8808 /* Cancel the job. It depends on the list the job is
8809 * living in. */
8810 switch(i) {
8811 case 0: /* io_newjobs */
8812 /* If the job was yet not processed the best thing to do
8813 * is to remove it from the queue at all */
8814 freeIOJob(job);
8815 listDelNode(lists[i],ln);
8816 break;
8817 case 1: /* io_processing */
8818 /* Oh Shi- the thread is messing with the Job:
8819 *
8820 * Probably it's accessing the object if this is a
8821 * PREPARE_SWAP or DO_SWAP job.
8822 * If it's a LOAD job it may be reading from disk and
8823 * if we don't wait for the job to terminate before to
8824 * cancel it, maybe in a few microseconds data can be
8825 * corrupted in this pages. So the short story is:
8826 *
8827 * Better to wait for the job to move into the
8828 * next queue (processed)... */
8829
8830 /* We try again and again until the job is completed. */
8831 unlockThreadedIO();
8832 /* But let's wait some time for the I/O thread
8833 * to finish with this job. After all this condition
8834 * should be very rare. */
8835 usleep(1);
8836 goto again;
8837 case 2: /* io_processed */
8838 /* The job was already processed, that's easy...
8839 * just mark it as canceled so that we'll ignore it
8840 * when processing completed jobs. */
8841 job->canceled = 1;
8842 break;
8843 }
8844 /* Finally we have to adjust the storage type of the object
8845 * in order to "UNDO" the operaiton. */
8846 if (o->storage == REDIS_VM_LOADING)
8847 o->storage = REDIS_VM_SWAPPED;
8848 else if (o->storage == REDIS_VM_SWAPPING)
8849 o->storage = REDIS_VM_MEMORY;
8850 unlockThreadedIO();
8851 return;
8852 }
8853 }
8854 }
8855 unlockThreadedIO();
8856 assert(1 != 1); /* We should never reach this */
8857 }
8858
8859 static void *IOThreadEntryPoint(void *arg) {
8860 iojob *j;
8861 listNode *ln;
8862 REDIS_NOTUSED(arg);
8863
8864 pthread_detach(pthread_self());
8865 while(1) {
8866 /* Get a new job to process */
8867 lockThreadedIO();
8868 if (listLength(server.io_newjobs) == 0) {
8869 /* No new jobs in queue, exit. */
8870 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8871 (long) pthread_self());
8872 server.io_active_threads--;
8873 unlockThreadedIO();
8874 return NULL;
8875 }
8876 ln = listFirst(server.io_newjobs);
8877 j = ln->value;
8878 listDelNode(server.io_newjobs,ln);
8879 /* Add the job in the processing queue */
8880 j->thread = pthread_self();
8881 listAddNodeTail(server.io_processing,j);
8882 ln = listLast(server.io_processing); /* We use ln later to remove it */
8883 unlockThreadedIO();
8884 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8885 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
8886
8887 /* Process the Job */
8888 if (j->type == REDIS_IOJOB_LOAD) {
8889 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
8890 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8891 FILE *fp = fopen("/dev/null","w+");
8892 j->pages = rdbSavedObjectPages(j->val,fp);
8893 fclose(fp);
8894 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8895 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8896 j->canceled = 1;
8897 }
8898
8899 /* Done: insert the job into the processed queue */
8900 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8901 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
8902 lockThreadedIO();
8903 listDelNode(server.io_processing,ln);
8904 listAddNodeTail(server.io_processed,j);
8905 unlockThreadedIO();
8906
8907 /* Signal the main thread there is new stuff to process */
8908 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8909 }
8910 return NULL; /* never reached */
8911 }
8912
8913 static void spawnIOThread(void) {
8914 pthread_t thread;
8915 sigset_t mask, omask;
8916 int err;
8917
8918 sigemptyset(&mask);
8919 sigaddset(&mask,SIGCHLD);
8920 sigaddset(&mask,SIGHUP);
8921 sigaddset(&mask,SIGPIPE);
8922 pthread_sigmask(SIG_SETMASK, &mask, &omask);
8923 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8924 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8925 strerror(err));
8926 usleep(1000000);
8927 }
8928 pthread_sigmask(SIG_SETMASK, &omask, NULL);
8929 server.io_active_threads++;
8930 }
8931
8932 /* We need to wait for the last thread to exit before we are able to
8933 * fork() in order to BGSAVE or BGREWRITEAOF. */
8934 static void waitEmptyIOJobsQueue(void) {
8935 while(1) {
8936 int io_processed_len;
8937
8938 lockThreadedIO();
8939 if (listLength(server.io_newjobs) == 0 &&
8940 listLength(server.io_processing) == 0 &&
8941 server.io_active_threads == 0)
8942 {
8943 unlockThreadedIO();
8944 return;
8945 }
8946 /* While waiting for empty jobs queue condition we post-process some
8947 * finshed job, as I/O threads may be hanging trying to write against
8948 * the io_ready_pipe_write FD but there are so much pending jobs that
8949 * it's blocking. */
8950 io_processed_len = listLength(server.io_processed);
8951 unlockThreadedIO();
8952 if (io_processed_len) {
8953 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8954 usleep(1000); /* 1 millisecond */
8955 } else {
8956 usleep(10000); /* 10 milliseconds */
8957 }
8958 }
8959 }
8960
8961 static void vmReopenSwapFile(void) {
8962 /* Note: we don't close the old one as we are in the child process
8963 * and don't want to mess at all with the original file object. */
8964 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8965 if (server.vm_fp == NULL) {
8966 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8967 server.vm_swap_file);
8968 _exit(1);
8969 }
8970 server.vm_fd = fileno(server.vm_fp);
8971 }
8972
8973 /* This function must be called while with threaded IO locked */
8974 static void queueIOJob(iojob *j) {
8975 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8976 (void*)j, j->type, (char*)j->key->ptr);
8977 listAddNodeTail(server.io_newjobs,j);
8978 if (server.io_active_threads < server.vm_max_threads)
8979 spawnIOThread();
8980 }
8981
8982 static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8983 iojob *j;
8984
8985 assert(key->storage == REDIS_VM_MEMORY);
8986 assert(key->refcount == 1);
8987
8988 j = zmalloc(sizeof(*j));
8989 j->type = REDIS_IOJOB_PREPARE_SWAP;
8990 j->db = db;
8991 j->key = dupStringObject(key);
8992 j->val = val;
8993 incrRefCount(val);
8994 j->canceled = 0;
8995 j->thread = (pthread_t) -1;
8996 key->storage = REDIS_VM_SWAPPING;
8997
8998 lockThreadedIO();
8999 queueIOJob(j);
9000 unlockThreadedIO();
9001 return REDIS_OK;
9002 }
9003
9004 /* ============ Virtual Memory - Blocking clients on missing keys =========== */
9005
9006 /* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9007 * If there is not already a job loading the key, it is craeted.
9008 * The key is added to the io_keys list in the client structure, and also
9009 * in the hash table mapping swapped keys to waiting clients, that is,
9010 * server.io_waited_keys. */
9011 static int waitForSwappedKey(redisClient *c, robj *key) {
9012 struct dictEntry *de;
9013 robj *o;
9014 list *l;
9015
9016 /* If the key does not exist or is already in RAM we don't need to
9017 * block the client at all. */
9018 de = dictFind(c->db->dict,key);
9019 if (de == NULL) return 0;
9020 o = dictGetEntryKey(de);
9021 if (o->storage == REDIS_VM_MEMORY) {
9022 return 0;
9023 } else if (o->storage == REDIS_VM_SWAPPING) {
9024 /* We were swapping the key, undo it! */
9025 vmCancelThreadedIOJob(o);
9026 return 0;
9027 }
9028
9029 /* OK: the key is either swapped, or being loaded just now. */
9030
9031 /* Add the key to the list of keys this client is waiting for.
9032 * This maps clients to keys they are waiting for. */
9033 listAddNodeTail(c->io_keys,key);
9034 incrRefCount(key);
9035
9036 /* Add the client to the swapped keys => clients waiting map. */
9037 de = dictFind(c->db->io_keys,key);
9038 if (de == NULL) {
9039 int retval;
9040
9041 /* For every key we take a list of clients blocked for it */
9042 l = listCreate();
9043 retval = dictAdd(c->db->io_keys,key,l);
9044 incrRefCount(key);
9045 assert(retval == DICT_OK);
9046 } else {
9047 l = dictGetEntryVal(de);
9048 }
9049 listAddNodeTail(l,c);
9050
9051 /* Are we already loading the key from disk? If not create a job */
9052 if (o->storage == REDIS_VM_SWAPPED) {
9053 iojob *j;
9054
9055 o->storage = REDIS_VM_LOADING;
9056 j = zmalloc(sizeof(*j));
9057 j->type = REDIS_IOJOB_LOAD;
9058 j->db = c->db;
9059 j->key = dupStringObject(key);
9060 j->key->vtype = o->vtype;
9061 j->page = o->vm.page;
9062 j->val = NULL;
9063 j->canceled = 0;
9064 j->thread = (pthread_t) -1;
9065 lockThreadedIO();
9066 queueIOJob(j);
9067 unlockThreadedIO();
9068 }
9069 return 1;
9070 }
9071
9072 /* Preload keys needed for the ZUNION and ZINTER commands. */
9073 static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9074 int i, num;
9075 num = atoi(c->argv[2]->ptr);
9076 for (i = 0; i < num; i++) {
9077 waitForSwappedKey(c,c->argv[3+i]);
9078 }
9079 }
9080
9081 /* Is this client attempting to run a command against swapped keys?
9082 * If so, block it ASAP, load the keys in background, then resume it.
9083 *
9084 * The important idea about this function is that it can fail! If keys will
9085 * still be swapped when the client is resumed, this key lookups will
9086 * just block loading keys from disk. In practical terms this should only
9087 * happen with SORT BY command or if there is a bug in this function.
9088 *
9089 * Return 1 if the client is marked as blocked, 0 if the client can
9090 * continue as the keys it is going to access appear to be in memory. */
9091 static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
9092 int j, last;
9093
9094 if (cmd->vm_preload_proc != NULL) {
9095 cmd->vm_preload_proc(c);
9096 } else {
9097 if (cmd->vm_firstkey == 0) return 0;
9098 last = cmd->vm_lastkey;
9099 if (last < 0) last = c->argc+last;
9100 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9101 waitForSwappedKey(c,c->argv[j]);
9102 }
9103
9104 /* If the client was blocked for at least one key, mark it as blocked. */
9105 if (listLength(c->io_keys)) {
9106 c->flags |= REDIS_IO_WAIT;
9107 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9108 server.vm_blocked_clients++;
9109 return 1;
9110 } else {
9111 return 0;
9112 }
9113 }
9114
9115 /* Remove the 'key' from the list of blocked keys for a given client.
9116 *
9117 * The function returns 1 when there are no longer blocking keys after
9118 * the current one was removed (and the client can be unblocked). */
9119 static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9120 list *l;
9121 listNode *ln;
9122 listIter li;
9123 struct dictEntry *de;
9124
9125 /* Remove the key from the list of keys this client is waiting for. */
9126 listRewind(c->io_keys,&li);
9127 while ((ln = listNext(&li)) != NULL) {
9128 if (compareStringObjects(ln->value,key) == 0) {
9129 listDelNode(c->io_keys,ln);
9130 break;
9131 }
9132 }
9133 assert(ln != NULL);
9134
9135 /* Remove the client form the key => waiting clients map. */
9136 de = dictFind(c->db->io_keys,key);
9137 assert(de != NULL);
9138 l = dictGetEntryVal(de);
9139 ln = listSearchKey(l,c);
9140 assert(ln != NULL);
9141 listDelNode(l,ln);
9142 if (listLength(l) == 0)
9143 dictDelete(c->db->io_keys,key);
9144
9145 return listLength(c->io_keys) == 0;
9146 }
9147
9148 static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9149 struct dictEntry *de;
9150 list *l;
9151 listNode *ln;
9152 int len;
9153
9154 de = dictFind(db->io_keys,key);
9155 if (!de) return;
9156
9157 l = dictGetEntryVal(de);
9158 len = listLength(l);
9159 /* Note: we can't use something like while(listLength(l)) as the list
9160 * can be freed by the calling function when we remove the last element. */
9161 while (len--) {
9162 ln = listFirst(l);
9163 redisClient *c = ln->value;
9164
9165 if (dontWaitForSwappedKey(c,key)) {
9166 /* Put the client in the list of clients ready to go as we
9167 * loaded all the keys about it. */
9168 listAddNodeTail(server.io_ready_clients,c);
9169 }
9170 }
9171 }
9172
9173 /* =========================== Remote Configuration ========================= */
9174
9175 static void configSetCommand(redisClient *c) {
9176 robj *o = getDecodedObject(c->argv[3]);
9177 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9178 zfree(server.dbfilename);
9179 server.dbfilename = zstrdup(o->ptr);
9180 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9181 zfree(server.requirepass);
9182 server.requirepass = zstrdup(o->ptr);
9183 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9184 zfree(server.masterauth);
9185 server.masterauth = zstrdup(o->ptr);
9186 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9187 server.maxmemory = strtoll(o->ptr, NULL, 10);
9188 } else {
9189 addReplySds(c,sdscatprintf(sdsempty(),
9190 "-ERR not supported CONFIG parameter %s\r\n",
9191 (char*)c->argv[2]->ptr));
9192 decrRefCount(o);
9193 return;
9194 }
9195 decrRefCount(o);
9196 addReply(c,shared.ok);
9197 }
9198
9199 static void configGetCommand(redisClient *c) {
9200 robj *o = getDecodedObject(c->argv[2]);
9201 robj *lenobj = createObject(REDIS_STRING,NULL);
9202 char *pattern = o->ptr;
9203 int matches = 0;
9204
9205 addReply(c,lenobj);
9206 decrRefCount(lenobj);
9207
9208 if (stringmatch(pattern,"dbfilename",0)) {
9209 addReplyBulkCString(c,"dbfilename");
9210 addReplyBulkCString(c,server.dbfilename);
9211 matches++;
9212 }
9213 if (stringmatch(pattern,"requirepass",0)) {
9214 addReplyBulkCString(c,"requirepass");
9215 addReplyBulkCString(c,server.requirepass);
9216 matches++;
9217 }
9218 if (stringmatch(pattern,"masterauth",0)) {
9219 addReplyBulkCString(c,"masterauth");
9220 addReplyBulkCString(c,server.masterauth);
9221 matches++;
9222 }
9223 if (stringmatch(pattern,"maxmemory",0)) {
9224 char buf[128];
9225
9226 snprintf(buf,128,"%llu\n",server.maxmemory);
9227 addReplyBulkCString(c,"maxmemory");
9228 addReplyBulkCString(c,buf);
9229 matches++;
9230 }
9231 decrRefCount(o);
9232 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9233 }
9234
9235 static void configCommand(redisClient *c) {
9236 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9237 if (c->argc != 4) goto badarity;
9238 configSetCommand(c);
9239 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9240 if (c->argc != 3) goto badarity;
9241 configGetCommand(c);
9242 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9243 if (c->argc != 2) goto badarity;
9244 server.stat_numcommands = 0;
9245 server.stat_numconnections = 0;
9246 server.stat_expiredkeys = 0;
9247 server.stat_starttime = time(NULL);
9248 addReply(c,shared.ok);
9249 } else {
9250 addReplySds(c,sdscatprintf(sdsempty(),
9251 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9252 }
9253 return;
9254
9255 badarity:
9256 addReplySds(c,sdscatprintf(sdsempty(),
9257 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9258 (char*) c->argv[1]->ptr));
9259 }
9260
9261 /* =========================== Pubsub implementation ======================== */
9262
9263 /* Subscribe a client to a class. Returns 1 if the operation succeeded, or
9264 * 0 if the client was already subscribed to that class. */
9265 static int pubsubSubscribe(redisClient *c, robj *class) {
9266 struct dictEntry *de;
9267 list *clients = NULL;
9268 int retval = 0;
9269
9270 /* Add the class to the client -> classes hash table */
9271 if (dictAdd(c->pubsub_classes,class,NULL) == DICT_OK) {
9272 retval = 1;
9273 incrRefCount(class);
9274 /* Add the client to the class -> list of clients hash table */
9275 de = dictFind(server.pubsub_classes,class);
9276 if (de == NULL) {
9277 clients = listCreate();
9278 dictAdd(server.pubsub_classes,class,clients);
9279 incrRefCount(class);
9280 } else {
9281 clients = dictGetEntryVal(de);
9282 }
9283 listAddNodeTail(clients,c);
9284 }
9285 /* Notify the client */
9286 addReply(c,shared.mbulk3);
9287 addReply(c,shared.subscribebulk);
9288 addReplyBulk(c,class);
9289 addReplyLong(c,dictSize(c->pubsub_classes));
9290 return retval;
9291 }
9292
9293 /* Unsubscribe a client from a class. Returns 1 if the operation succeeded, or
9294 * 0 if the client was not subscribed to the specified class. */
9295 static int pubsubUnsubscribe(redisClient *c, robj *class, int notify) {
9296 struct dictEntry *de;
9297 list *clients;
9298 listNode *ln;
9299 int retval = 0;
9300
9301 /* Remove the class from the client -> classes hash table */
9302 if (dictDelete(c->pubsub_classes,class) == DICT_OK) {
9303 retval = 1;
9304 /* Remove the client from the class -> clients list hash table */
9305 de = dictFind(server.pubsub_classes,class);
9306 assert(de != NULL);
9307 clients = dictGetEntryVal(de);
9308 ln = listSearchKey(clients,c);
9309 assert(ln != NULL);
9310 listDelNode(clients,ln);
9311 }
9312 /* Notify the client */
9313 if (notify) {
9314 addReply(c,shared.mbulk3);
9315 addReply(c,shared.unsubscribebulk);
9316 addReplyBulk(c,class);
9317 addReplyLong(c,dictSize(c->pubsub_classes));
9318 }
9319 return retval;
9320 }
9321
9322 /* Unsubscribe from all the classes. Return the number of classes the
9323 * client was subscribed to. */
9324 static int pubsubUnsubscribeAll(redisClient *c, int notify) {
9325 dictIterator *di = dictGetIterator(c->pubsub_classes);
9326 dictEntry *de;
9327 int count = 0;
9328
9329 while((de = dictNext(di)) != NULL) {
9330 robj *class = dictGetEntryKey(de);
9331
9332 count += pubsubUnsubscribe(c,class,notify);
9333 }
9334 dictReleaseIterator(di);
9335 return count;
9336 }
9337
9338 /* Publish a message */
9339 static int pubsubPublishMessage(robj *class, robj *message) {
9340 int receivers = 0;
9341 struct dictEntry *de;
9342
9343 de = dictFind(server.pubsub_classes,class);
9344 if (de) {
9345 list *list = dictGetEntryVal(de);
9346 listNode *ln;
9347 listIter li;
9348
9349 listRewind(list,&li);
9350 while ((ln = listNext(&li)) != NULL) {
9351 redisClient *c = ln->value;
9352
9353 addReply(c,shared.mbulk3);
9354 addReply(c,shared.messagebulk);
9355 addReplyBulk(c,class);
9356 addReplyBulk(c,message);
9357 receivers++;
9358 }
9359 }
9360 return receivers;
9361 }
9362
9363 static void subscribeCommand(redisClient *c) {
9364 int j;
9365
9366 for (j = 1; j < c->argc; j++)
9367 pubsubSubscribe(c,c->argv[j]);
9368 }
9369
9370 static void unsubscribeCommand(redisClient *c) {
9371 if (c->argc == 1) {
9372 pubsubUnsubscribeAll(c,1);
9373 return;
9374 } else {
9375 int j;
9376
9377 for (j = 1; j < c->argc; j++)
9378 pubsubUnsubscribe(c,c->argv[j],1);
9379 }
9380 }
9381
9382 static void publishCommand(redisClient *c) {
9383 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9384 addReplyLong(c,receivers);
9385 }
9386
9387 /* ================================= Debugging ============================== */
9388
9389 static void debugCommand(redisClient *c) {
9390 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9391 *((char*)-1) = 'x';
9392 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9393 if (rdbSave(server.dbfilename) != REDIS_OK) {
9394 addReply(c,shared.err);
9395 return;
9396 }
9397 emptyDb();
9398 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9399 addReply(c,shared.err);
9400 return;
9401 }
9402 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9403 addReply(c,shared.ok);
9404 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9405 emptyDb();
9406 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9407 addReply(c,shared.err);
9408 return;
9409 }
9410 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9411 addReply(c,shared.ok);
9412 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9413 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9414 robj *key, *val;
9415
9416 if (!de) {
9417 addReply(c,shared.nokeyerr);
9418 return;
9419 }
9420 key = dictGetEntryKey(de);
9421 val = dictGetEntryVal(de);
9422 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9423 key->storage == REDIS_VM_SWAPPING)) {
9424 char *strenc;
9425 char buf[128];
9426
9427 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9428 strenc = strencoding[val->encoding];
9429 } else {
9430 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9431 strenc = buf;
9432 }
9433 addReplySds(c,sdscatprintf(sdsempty(),
9434 "+Key at:%p refcount:%d, value at:%p refcount:%d "
9435 "encoding:%s serializedlength:%lld\r\n",
9436 (void*)key, key->refcount, (void*)val, val->refcount,
9437 strenc, (long long) rdbSavedObjectLen(val,NULL)));
9438 } else {
9439 addReplySds(c,sdscatprintf(sdsempty(),
9440 "+Key at:%p refcount:%d, value swapped at: page %llu "
9441 "using %llu pages\r\n",
9442 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9443 (unsigned long long) key->vm.usedpages));
9444 }
9445 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9446 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9447 robj *key, *val;
9448
9449 if (!server.vm_enabled) {
9450 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9451 return;
9452 }
9453 if (!de) {
9454 addReply(c,shared.nokeyerr);
9455 return;
9456 }
9457 key = dictGetEntryKey(de);
9458 val = dictGetEntryVal(de);
9459 /* If the key is shared we want to create a copy */
9460 if (key->refcount > 1) {
9461 robj *newkey = dupStringObject(key);
9462 decrRefCount(key);
9463 key = dictGetEntryKey(de) = newkey;
9464 }
9465 /* Swap it */
9466 if (key->storage != REDIS_VM_MEMORY) {
9467 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
9468 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9469 dictGetEntryVal(de) = NULL;
9470 addReply(c,shared.ok);
9471 } else {
9472 addReply(c,shared.err);
9473 }
9474 } else {
9475 addReplySds(c,sdsnew(
9476 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
9477 }
9478 }
9479
9480 static void _redisAssert(char *estr, char *file, int line) {
9481 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9482 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9483 #ifdef HAVE_BACKTRACE
9484 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9485 *((char*)-1) = 'x';
9486 #endif
9487 }
9488
9489 /* =================================== Main! ================================ */
9490
9491 #ifdef __linux__
/* Read the overcommit policy from /proc/sys/vm/overcommit_memory.
 *
 * Returns the number found in the file, or -1 if the file can't be
 * opened, can't be read, or does not start with a valid number. The
 * parsing is validated because 0 is a meaningful value (it is the one
 * that triggers the overcommit warning) and must not be the result of a
 * failed conversion. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *eptr;
    long val;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    errno = 0;
    val = strtol(buf,&eptr,10);
    /* No digits at all, or a number that does not fit. Trailing data
     * (the newline, usually) is fine. */
    if (eptr == buf || errno == ERANGE) return -1;
    return (int) val;
}
9505
9506 void linuxOvercommitMemoryWarning(void) {
9507 if (linuxOvercommitMemoryValue() == 0) {
9508 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9509 }
9510 }
9511 #endif /* __linux__ */
9512
9513 static void daemonize(void) {
9514 int fd;
9515 FILE *fp;
9516
9517 if (fork() != 0) exit(0); /* parent exits */
9518 setsid(); /* create a new session */
9519
9520 /* Every output goes to /dev/null. If Redis is daemonized but
9521 * the 'logfile' is set to 'stdout' in the configuration file
9522 * it will not log at all. */
9523 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9524 dup2(fd, STDIN_FILENO);
9525 dup2(fd, STDOUT_FILENO);
9526 dup2(fd, STDERR_FILENO);
9527 if (fd > STDERR_FILENO) close(fd);
9528 }
9529 /* Try to write the pid file */
9530 fp = fopen(server.pidfile,"w");
9531 if (fp) {
9532 fprintf(fp,"%d\n",getpid());
9533 fclose(fp);
9534 }
9535 }
9536
9537 static void version() {
9538 printf("Redis server version %s\n", REDIS_VERSION);
9539 exit(0);
9540 }
9541
/* Print the command line synopsis on the standard error and terminate the
 * process with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9547
9548 int main(int argc, char **argv) {
9549 time_t start;
9550
9551 initServerConfig();
9552 if (argc == 2) {
9553 if (strcmp(argv[1], "-v") == 0 ||
9554 strcmp(argv[1], "--version") == 0) version();
9555 if (strcmp(argv[1], "--help") == 0) usage();
9556 resetServerSaveParams();
9557 loadServerConfig(argv[1]);
9558 } else if ((argc > 2)) {
9559 usage();
9560 } else {
9561 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9562 }
9563 if (server.daemonize) daemonize();
9564 initServer();
9565 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9566 #ifdef __linux__
9567 linuxOvercommitMemoryWarning();
9568 #endif
9569 start = time(NULL);
9570 if (server.appendonly) {
9571 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9572 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9573 } else {
9574 if (rdbLoad(server.dbfilename) == REDIS_OK)
9575 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9576 }
9577 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
9578 aeSetBeforeSleepProc(server.el,beforeSleep);
9579 aeMain(server.el);
9580 aeDeleteEventLoop(server.el);
9581 return 0;
9582 }
9583
9584 /* ============================= Backtrace support ========================= */
9585
9586 #ifdef HAVE_BACKTRACE
9587 static char *findFuncName(void *pointer, unsigned long *offset);
9588
9589 static void *getMcontextEip(ucontext_t *uc) {
9590 #if defined(__FreeBSD__)
9591 return (void*) uc->uc_mcontext.mc_eip;
9592 #elif defined(__dietlibc__)
9593 return (void*) uc->uc_mcontext.eip;
9594 #elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9595 #if __x86_64__
9596 return (void*) uc->uc_mcontext->__ss.__rip;
9597 #else
9598 return (void*) uc->uc_mcontext->__ss.__eip;
9599 #endif
9600 #elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9601 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9602 return (void*) uc->uc_mcontext->__ss.__rip;
9603 #else
9604 return (void*) uc->uc_mcontext->__ss.__eip;
9605 #endif
9606 #elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
9607 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9608 #elif defined(__ia64__) /* Linux IA64 */
9609 return (void*) uc->uc_mcontext.sc_ip;
9610 #else
9611 return NULL;
9612 #endif
9613 }
9614
9615 static void segvHandler(int sig, siginfo_t *info, void *secret) {
9616 void *trace[100];
9617 char **messages = NULL;
9618 int i, trace_size = 0;
9619 unsigned long offset=0;
9620 ucontext_t *uc = (ucontext_t*) secret;
9621 sds infostring;
9622 REDIS_NOTUSED(info);
9623
9624 redisLog(REDIS_WARNING,
9625 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
9626 infostring = genRedisInfoString();
9627 redisLog(REDIS_WARNING, "%s",infostring);
9628 /* It's not safe to sdsfree() the returned string under memory
9629 * corruption conditions. Let it leak as we are going to abort */
9630
9631 trace_size = backtrace(trace, 100);
9632 /* overwrite sigaction with caller's address */
9633 if (getMcontextEip(uc) != NULL) {
9634 trace[1] = getMcontextEip(uc);
9635 }
9636 messages = backtrace_symbols(trace, trace_size);
9637
9638 for (i=1; i<trace_size; ++i) {
9639 char *fn = findFuncName(trace[i], &offset), *p;
9640
9641 p = strchr(messages[i],'+');
9642 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9643 redisLog(REDIS_WARNING,"%s", messages[i]);
9644 } else {
9645 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9646 }
9647 }
9648 /* free(messages); Don't call free() with possibly corrupted memory. */
9649 _exit(0);
9650 }
9651
9652 static void setupSigSegvAction(void) {
9653 struct sigaction act;
9654
9655 sigemptyset (&act.sa_mask);
9656 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9657 * is used. Otherwise, sa_handler is used */
9658 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9659 act.sa_sigaction = segvHandler;
9660 sigaction (SIGSEGV, &act, NULL);
9661 sigaction (SIGBUS, &act, NULL);
9662 sigaction (SIGFPE, &act, NULL);
9663 sigaction (SIGILL, &act, NULL);
9664 sigaction (SIGBUS, &act, NULL);
9665 return;
9666 }
9667
9668 #include "staticsymbols.h"
9669 /* This function try to convert a pointer into a function name. It's used in
9670 * oreder to provide a backtrace under segmentation fault that's able to
9671 * display functions declared as static (otherwise the backtrace is useless). */
9672 static char *findFuncName(void *pointer, unsigned long *offset){
9673 int i, ret = -1;
9674 unsigned long off, minoff = 0;
9675
9676 /* Try to match against the Symbol with the smallest offset */
9677 for (i=0; symsTable[i].pointer; i++) {
9678 unsigned long lp = (unsigned long) pointer;
9679
9680 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9681 off=lp-symsTable[i].pointer;
9682 if (ret < 0 || off < minoff) {
9683 minoff=off;
9684 ret=i;
9685 }
9686 }
9687 }
9688 if (ret == -1) return NULL;
9689 *offset = minoff;
9690 return symsTable[ret].name;
9691 }
9692 #else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no signal handler is
 * installed and the call is a no-op. */
static void setupSigSegvAction(void) {
    return;
}
9695 #endif /* HAVE_BACKTRACE */
9696
9697
9698
9699 /* The End */
9700
9701
9702